Unescape JSON string token before transcoding to UTF-16 and returning to the user...
author Ahson Khan <ahkha@microsoft.com>
Sat, 19 Jan 2019 14:37:57 +0000 (06:37 -0800)
committer GitHub <noreply@github.com>
Sat, 19 Jan 2019 14:37:57 +0000 (06:37 -0800)
* Unescape JSON string token before transcoding to UTF-16 and returning to
the user.

* Handle escaping and transcoding surrogate pairs correctly.

* Fix unescaping forward slash.

* Add a double high surrogate test.

* Factor out common helpers between the reader and writer.

* Fix exception messages.

* Fix build issues related to source package.

* Add comments regarding Debug.Asserts and fix InRange impl.

* Don't throw DecoderFallbackException and add invalid UTF-8 string tests.

* Wrap the DecoderFallbackException within the InvalidOperationException.

Commit migrated from https://github.com/dotnet/corefx/commit/38e5e28646687da306ad1f3e3fc9876e67e031bb

12 files changed:
src/libraries/System.Text.Json/src/Resources/Strings.resx
src/libraries/System.Text.Json/src/System.Text.Json.csproj
src/libraries/System.Text.Json/src/System/Text/Json/JsonConstants.cs
src/libraries/System.Text.Json/src/System/Text/Json/JsonHelpers.cs [new file with mode: 0644]
src/libraries/System.Text.Json/src/System/Text/Json/Reader/JsonReaderHelper.Unescaping.cs [new file with mode: 0644]
src/libraries/System.Text.Json/src/System/Text/Json/Reader/JsonReaderHelper.cs
src/libraries/System.Text.Json/src/System/Text/Json/Reader/Utf8JsonReader.TryGet.cs
src/libraries/System.Text.Json/src/System/Text/Json/ThrowHelper.cs
src/libraries/System.Text.Json/src/System/Text/Json/Writer/JsonWriterHelper.Escaping.cs
src/libraries/System.Text.Json/src/System/Text/Json/Writer/JsonWriterHelper.Transcoding.cs
src/libraries/System.Text.Json/tests/Utf8JsonReaderTests.TryGet.cs
src/libraries/System.Text.Json/tests/Utf8JsonReaderTests.cs

index 2d1fe0a..d347f33 100644 (file)
   <data name="CallFlushToAvoidDataLoss" xml:space="preserve">
     <value>The JSON writer needs to be flushed before getting the current state. There are {0} bytes that have not been committed to the output.</value>
   </data>
+  <data name="CannotReadIncompleteUTF16" xml:space="preserve">
+    <value>Cannot read incomplete UTF-16 JSON text as string with missing low surrogate.</value>
+  </data>
+  <data name="CannotReadInvalidUTF16" xml:space="preserve">
+    <value>Cannot read invalid UTF-16 JSON text as string. Invalid surrogate value: '{0}'.</value>
+  </data>
   <data name="CannotStartObjectArrayAfterPrimitiveOrClose" xml:space="preserve">
     <value>Cannot write the start of an object/array after a single JSON value or outside of an existing closed object/array. Current token type is '{0}'.</value>
   </data>
   <data name="CannotStartObjectArrayWithoutProperty" xml:space="preserve">
     <value>Cannot write the start of an object or array without a property name. Current token type is '{0}'.</value>
   </data>
+  <data name="CannotTranscodeInvalidUtf8" xml:space="preserve">
+    <value>Cannot transcode invalid UTF-8 JSON text to UTF-16 string.</value>
+  </data>
   <data name="CannotWriteInvalidUTF16" xml:space="preserve">
-    <value>Cannot write invalid UTF-16 text as JSON. Invalid surrogate pair: '{0}'.</value>
+    <value>Cannot write invalid UTF-16 text as JSON. Invalid surrogate value: '{0}'.</value>
   </data>
   <data name="CannotWriteInvalidUTF8" xml:space="preserve">
     <value>Cannot write invalid UTF-8 text as JSON. Invalid input: '{0}'.</value>
index c4c1727..6d9dd46 100644 (file)
     <Compile Include="System\Text\Json\BitStack.cs" />
     <Compile Include="System\Text\Json\JsonCommentHandling.cs" />
     <Compile Include="System\Text\Json\JsonConstants.cs" />
+    <Compile Include="System\Text\Json\JsonHelpers.cs" />
     <Compile Include="System\Text\Json\JsonTokenType.cs" />
     <Compile Include="System\Text\Json\ThrowHelper.cs" />
     <Compile Include="System\Text\Json\Reader\ConsumeNumberResult.cs" />
     <Compile Include="System\Text\Json\Reader\ConsumeTokenResult.cs" />
     <Compile Include="System\Text\Json\Reader\JsonReaderException.cs" />
     <Compile Include="System\Text\Json\Reader\JsonReaderHelper.cs" />
+    <Compile Include="System\Text\Json\Reader\JsonReaderHelper.Unescaping.cs" />
     <Compile Include="System\Text\Json\Reader\JsonReaderOptions.cs" />
     <Compile Include="System\Text\Json\Reader\JsonReaderState.cs" />
     <Compile Include="System\Text\Json\Reader\Utf8JsonReader.cs" />
index c56f8d9..2b6960c 100644 (file)
@@ -37,6 +37,8 @@ namespace System.Text.Json
         public const int MaxWriterDepth = 1_000;
         public const int RemoveFlagsBitMask = 0x7FFFFFFF;
 
+        public const int StackallocThreshold = 256;
+
         // In the worst case, an ASCII character represented as a single utf-8 byte could expand 6x when escaped.
         // For example: '+' becomes '\u002B'
         // Escaping surrogate pairs (represented by 3 or 4 utf-8 bytes) would expand to 12 bytes (which is still <= 6x).
@@ -66,6 +68,6 @@ namespace System.Text.Json
         public const int HighSurrogateEndValue = 0xDBFF;
         public const int LowSurrogateStartValue = 0xDC00;
         public const int LowSurrogateEndValue = 0xDFFF;
-        public const int ShiftRightBy10 = 0x400;
+        public const int BitShiftBy10 = 0x400;
     }
 }
diff --git a/src/libraries/System.Text.Json/src/System/Text/Json/JsonHelpers.cs b/src/libraries/System.Text.Json/src/System/Text/Json/JsonHelpers.cs
new file mode 100644 (file)
index 0000000..d8998ee
--- /dev/null
@@ -0,0 +1,50 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Runtime.CompilerServices;
+
+namespace System.Text.Json
+{
+    internal static partial class JsonHelpers
+    {
+        /// <summary>
+        /// Returns <see langword="true"/> iff <paramref name="value"/> is a valid Unicode scalar
+        /// value, i.e., is in [ U+0000..U+D7FF ], inclusive; or [ U+E000..U+10FFFF ], inclusive.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static bool IsValidUnicodeScalar(uint value)
+        {
+            // By XORing the incoming value with 0xD800, surrogate code points
+            // are moved to the range [ U+0000..U+07FF ], and all valid scalar
+            // values are clustered into the single range [ U+0800..U+10FFFF ],
+            // which allows performing a single fast range check.
+
+            return IsInRangeInclusive(value ^ 0xD800U, 0x800U, 0x10FFFFU);
+        }
+
+        /// <summary>
+        /// Returns <see langword="true"/> iff <paramref name="value"/> is between
+        /// <paramref name="lowerBound"/> and <paramref name="upperBound"/>, inclusive.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static bool IsInRangeInclusive(uint value, uint lowerBound, uint upperBound)
+            => (value - lowerBound) <= (upperBound - lowerBound);
+
+        /// <summary>
+        /// Returns <see langword="true"/> iff <paramref name="value"/> is between
+        /// <paramref name="lowerBound"/> and <paramref name="upperBound"/>, inclusive.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static bool IsInRangeInclusive(byte value, byte lowerBound, byte upperBound)
+            => ((byte)(value - lowerBound) <= (byte)(upperBound - lowerBound));
+
+        /// <summary>
+        /// Returns <see langword="true"/> iff <paramref name="value"/> is between
+        /// <paramref name="lowerBound"/> and <paramref name="upperBound"/>, inclusive.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static bool IsInRangeInclusive(int value, int lowerBound, int upperBound)
+            => (uint)(value - lowerBound) <= (uint)(upperBound - lowerBound);
+    }
+}
diff --git a/src/libraries/System.Text.Json/src/System/Text/Json/Reader/JsonReaderHelper.Unescaping.cs b/src/libraries/System.Text.Json/src/System/Text/Json/Reader/JsonReaderHelper.Unescaping.cs
new file mode 100644 (file)
index 0000000..8ecdb54
--- /dev/null
@@ -0,0 +1,233 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Buffers;
+using System.Buffers.Text;
+using System.Diagnostics;
+
+namespace System.Text.Json
+{
+    internal static partial class JsonReaderHelper
+    {
+        // Reject any invalid UTF-8 data rather than silently replacing.
+        public static readonly UTF8Encoding s_utf8Encoding = new UTF8Encoding(encoderShouldEmitUTF8Identifier: false, throwOnInvalidBytes: true);
+
+        // TODO: Similar to escaping, replace the unescaping logic with publicly shipping APIs from https://github.com/dotnet/corefx/issues/33509
+        public static string GetUnescapedString(ReadOnlySpan<byte> utf8Source, int idx)
+        {
+            byte[] unescapedArray = null;
+
+            Span<byte> utf8Unescaped = utf8Source.Length <= JsonConstants.StackallocThreshold ?
+                stackalloc byte[utf8Source.Length] :
+                (unescapedArray = ArrayPool<byte>.Shared.Rent(utf8Source.Length));
+
+            Unescape(utf8Source, utf8Unescaped, idx, out int written);
+            Debug.Assert(written > 0);
+
+            utf8Unescaped = utf8Unescaped.Slice(0, written);
+            Debug.Assert(!utf8Unescaped.IsEmpty);
+
+            string utf8String = TranscodeHelper(utf8Unescaped);
+
+            if (unescapedArray != null)
+            {
+                utf8Unescaped.Clear();
+                ArrayPool<byte>.Shared.Return(unescapedArray);
+            }
+
+            return utf8String;
+        }
+
+        public static string TranscodeHelper(ReadOnlySpan<byte> utf8Unescaped)
+        {
+            try
+            {
+#if BUILDING_INBOX_LIBRARY
+                return s_utf8Encoding.GetString(utf8Unescaped);
+#else
+                if (utf8Unescaped.IsEmpty)
+                {
+                    return string.Empty;
+                }
+                unsafe
+                {
+                    fixed (byte* bytePtr = utf8Unescaped)
+                    {
+                        return s_utf8Encoding.GetString(bytePtr, utf8Unescaped.Length);
+                    }
+                }
+#endif
+            }
+            catch (DecoderFallbackException ex)
+            {
+                // We want to be consistent with the exception being thrown
+                // so the user only has to catch a single exception.
+                // Since we already throw InvalidOperationException for mismatch token type,
+                // and while unescaping, using that exception for failure to decode invalid UTF-8 bytes as well.
+                // Therefore, wrapping the DecoderFallbackException around an InvalidOperationException.
+                throw ThrowHelper.GetInvalidOperationException_ReadInvalidUTF8(ex);
+            }
+        }
+
+        private static void Unescape(ReadOnlySpan<byte> source, Span<byte> destination, int idx, out int written)
+        {
+            Debug.Assert(idx >= 0 && idx < source.Length);
+            Debug.Assert(source[idx] == JsonConstants.BackSlash);
+            Debug.Assert(destination.Length >= source.Length);
+
+            source.Slice(0, idx).CopyTo(destination);
+            written = idx;
+
+            for (; idx < source.Length; idx++)
+            {
+                byte currentByte = source[idx];
+                if (currentByte == JsonConstants.BackSlash)
+                {
+                    idx++;
+                    currentByte = source[idx];
+
+                    if (currentByte == JsonConstants.Quote)
+                    {
+                        destination[written++] = JsonConstants.Quote;
+                    }
+                    else if (currentByte == 'n')
+                    {
+                        destination[written++] = JsonConstants.LineFeed;
+                    }
+                    else if (currentByte == 'r')
+                    {
+                        destination[written++] = JsonConstants.CarriageReturn;
+                    }
+                    else if (currentByte == JsonConstants.BackSlash)
+                    {
+                        destination[written++] = JsonConstants.BackSlash;
+                    }
+                    else if (currentByte == JsonConstants.Slash)
+                    {
+                        destination[written++] = JsonConstants.Slash;
+                    }
+                    else if (currentByte == 't')
+                    {
+                        destination[written++] = JsonConstants.Tab;
+                    }
+                    else if (currentByte == 'b')
+                    {
+                        destination[written++] = JsonConstants.BackSpace;
+                    }
+                    else if (currentByte == 'f')
+                    {
+                        destination[written++] = JsonConstants.FormFeed;
+                    }
+                    else if (currentByte == 'u')
+                    {
+                        // The source is known to be valid JSON, and hence if we see a \u, it is guaranteed to have 4 hex digits following it.
+                        // Otherwise, the Utf8JsonReader would have already thrown an exception.
+                        Debug.Assert(source.Length >= idx + 5);
+
+                        bool result = Utf8Parser.TryParse(source.Slice(idx + 1, 4), out int scalar, out int bytesConsumed, 'x');
+                        Debug.Assert(result);
+                        Debug.Assert(bytesConsumed == 4);
+                        idx += bytesConsumed;     // The loop iteration will increment idx past the last hex digit
+
+                        if (JsonHelpers.IsInRangeInclusive((uint)scalar, JsonConstants.HighSurrogateStartValue, JsonConstants.LowSurrogateEndValue))
+                        {
+                            // The first hex value cannot be a low surrogate.
+                            if (scalar >= JsonConstants.LowSurrogateStartValue)
+                            {
+                                ThrowHelper.ThrowInvalidOperationException_ReadInvalidUTF16(scalar);
+                            }
+
+                            Debug.Assert(JsonHelpers.IsInRangeInclusive((uint)scalar, JsonConstants.HighSurrogateStartValue, JsonConstants.HighSurrogateEndValue));
+
+                            idx += 3;   // Skip the last hex digit and the next \u
+
+                            // We must have a low surrogate following a high surrogate.
+                            if (source.Length < idx + 4 || source[idx - 2] != '\\' || source[idx - 1] != 'u')
+                            {
+                                ThrowHelper.ThrowInvalidOperationException_ReadInvalidUTF16();
+                            }
+
+                            // The source is known to be valid JSON, and hence if we see a \u, it is guaranteed to have 4 hex digits following it.
+                            // Otherwise, the Utf8JsonReader would have already thrown an exception.
+                            result = Utf8Parser.TryParse(source.Slice(idx, 4), out int lowSurrogate, out bytesConsumed, 'x');
+                            Debug.Assert(result);
+                            Debug.Assert(bytesConsumed == 4);
+
+                            // If the first hex value is a high surrogate, the next one must be a low surrogate.
+                            if (!JsonHelpers.IsInRangeInclusive((uint)lowSurrogate, JsonConstants.LowSurrogateStartValue, JsonConstants.LowSurrogateEndValue))
+                            {
+                                ThrowHelper.ThrowInvalidOperationException_ReadInvalidUTF16(lowSurrogate);
+                            }
+
+                            idx += bytesConsumed - 1;  // The loop iteration will increment idx past the last hex digit
+
+                            // To find the unicode scalar:
+                            // (0x400 * (High surrogate - 0xD800)) + Low surrogate - 0xDC00 + 0x10000
+                            scalar = (JsonConstants.BitShiftBy10 * (scalar - JsonConstants.HighSurrogateStartValue))
+                                + (lowSurrogate - JsonConstants.LowSurrogateStartValue)
+                                + JsonConstants.UnicodePlane01StartValue;
+                        }
+
+#if BUILDING_INBOX_LIBRARY
+                        var rune = new Rune(scalar);
+                        result = rune.TryEncodeToUtf8Bytes(destination.Slice(written), out int bytesWritten);
+                        Debug.Assert(result);
+#else
+                        EncodeToUtf8Bytes((uint)scalar, destination.Slice(written), out int bytesWritten);
+#endif
+                        Debug.Assert(bytesWritten <= 4);
+                        written += bytesWritten;
+                    }
+                }
+                else
+                {
+                    destination[written++] = currentByte;
+                }
+            }
+        }
+
+#if !BUILDING_INBOX_LIBRARY
+        /// <summary>
+        /// Copies the UTF-8 code unit representation of this scalar to an output buffer.
+        /// The buffer must be large enough to hold the required number of <see cref="byte"/>s.
+        /// </summary>
+        private static void EncodeToUtf8Bytes(uint scalar, Span<byte> utf8Destination, out int bytesWritten)
+        {
+            Debug.Assert(JsonHelpers.IsValidUnicodeScalar(scalar));
+            Debug.Assert(utf8Destination.Length >= 4);
+
+            if (scalar < 0x80U)
+            {
+                // Single UTF-8 code unit
+                utf8Destination[0] = (byte)scalar;
+                bytesWritten = 1;
+            }
+            else if (scalar < 0x800U)
+            {
+                // Two UTF-8 code units
+                utf8Destination[0] = (byte)(0xC0U | (scalar >> 6));
+                utf8Destination[1] = (byte)(0x80U | (scalar & 0x3FU));
+                bytesWritten = 2;
+            }
+            else if (scalar < 0x10000U)
+            {
+                // Three UTF-8 code units
+                utf8Destination[0] = (byte)(0xE0U | (scalar >> 12));
+                utf8Destination[1] = (byte)(0x80U | ((scalar >> 6) & 0x3FU));
+                utf8Destination[2] = (byte)(0x80U | (scalar & 0x3FU));
+                bytesWritten = 3;
+            }
+            else
+            {
+                // Four UTF-8 code units
+                utf8Destination[0] = (byte)(0xF0U | (scalar >> 18));
+                utf8Destination[1] = (byte)(0x80U | ((scalar >> 12) & 0x3FU));
+                utf8Destination[2] = (byte)(0x80U | ((scalar >> 6) & 0x3FU));
+                utf8Destination[3] = (byte)(0x80U | (scalar & 0x3FU));
+                bytesWritten = 4;
+            }
+        }
+#endif
+    }
+}
index 56d2d4e..66b9263 100644 (file)
@@ -13,7 +13,7 @@ using Internal.Runtime.CompilerServices;
 
 namespace System.Text.Json
 {
-    internal static class JsonReaderHelper
+    internal static partial class JsonReaderHelper
     {
         public static (int, int) CountNewLines(ReadOnlySpan<byte> data)
         {
index 066757c..f593e1f 100644 (file)
@@ -10,19 +10,14 @@ namespace System.Text.Json
 {
     public ref partial struct Utf8JsonReader
     {
-        // Reject any invalid UTF-8 data rather than silently replacing.
-        private static readonly UTF8Encoding s_utf8Encoding = new UTF8Encoding(encoderShouldEmitUTF8Identifier: false, throwOnInvalidBytes: true);
-
         /// <summary>
-        /// Reads the next JSON token value from the source transcoded as a <see cref="string"/>.
+        /// Reads the next JSON token value from the source, unescaped, and transcoded as a <see cref="string"/>.
         /// </summary>
         /// <exception cref="InvalidOperationException">
         /// Thrown if trying to get the value of the JSON token that is not a string
         /// (i.e. other than <see cref="JsonTokenType.String"/> or <see cref="JsonTokenType.PropertyName"/>).
         /// <seealso cref="TokenType" />
-        /// </exception>
-        /// <exception cref="ArgumentException">
-        /// Thrown if invalid UTF-8 byte sequences are detected while transcoding.
+        /// It will also throw when the JSON string contains invalid UTF-8 bytes, or invalid UTF-16 surrogates.
         /// </exception>
         public string GetStringValue()
         {
@@ -33,23 +28,12 @@ namespace System.Text.Json
 
             ReadOnlySpan<byte> span = HasValueSequence ? ValueSequence.ToArray() : ValueSpan;
 
-#if BUILDING_INBOX_LIBRARY
-            // TODO: https://github.com/dotnet/corefx/issues/33292
-            return s_utf8Encoding.GetString(span);
-#else
-            if (span.IsEmpty)
-            {
-                return string.Empty;
-            }
-            unsafe
+            int idx = span.IndexOf(JsonConstants.BackSlash);
+            if (idx != -1)
             {
-                fixed (byte* bytePtr = span)
-                {
-                    // TODO: https://github.com/dotnet/corefx/issues/33292
-                    return s_utf8Encoding.GetString(bytePtr, span.Length);
-                }
+                return JsonReaderHelper.GetUnescapedString(span, idx);
             }
-#endif
+            return JsonReaderHelper.TranscodeHelper(span);
         }
 
         /// <summary>
index 81f1776..10a094d 100644 (file)
@@ -340,6 +340,21 @@ namespace System.Text.Json
             throw new ArgumentException(SR.Format(SR.CannotWriteInvalidUTF16, $"0x{charAsInt:X2}"));
         }
 
+        public static void ThrowInvalidOperationException_ReadInvalidUTF16(int charAsInt)
+        {
+            throw new InvalidOperationException(SR.Format(SR.CannotReadInvalidUTF16, $"0x{charAsInt:X2}"));
+        }
+
+        public static void ThrowInvalidOperationException_ReadInvalidUTF16()
+        {
+            throw new InvalidOperationException(SR.CannotReadIncompleteUTF16);
+        }
+
+        public static InvalidOperationException GetInvalidOperationException_ReadInvalidUTF8(DecoderFallbackException innerException)
+        {
+            return new InvalidOperationException(SR.CannotTranscodeInvalidUtf8, innerException);
+        }
+
         [MethodImpl(MethodImplOptions.NoInlining)]
         public static InvalidOperationException GetInvalidOperationException(ExceptionResource resource, int currentDepth, byte token, JsonTokenType tokenType)
         {
index 1537cab..8760493 100644 (file)
@@ -151,7 +151,7 @@ namespace System.Text.Json
                         // Divide by 0x400 to shift right by 10 in order to find the surrogate pairs from the scalar
                         // High surrogate = ((scalar -  0x10000) / 0x400) + D800
                         // Low surrogate = ((scalar -  0x10000) % 0x400) + DC00
-                        int quotient = Math.DivRem(scalar - JsonConstants.UnicodePlane01StartValue, JsonConstants.ShiftRightBy10, out int remainder);
+                        int quotient = Math.DivRem(scalar - JsonConstants.UnicodePlane01StartValue, JsonConstants.BitShiftBy10, out int remainder);
                         int firstChar = quotient + JsonConstants.HighSurrogateStartValue;
                         int nextChar = remainder + JsonConstants.LowSurrogateStartValue;
                         bool result = Utf8Formatter.TryFormat(firstChar, destination.Slice(written), out int bytesWritten, format: s_hexStandardFormat);
@@ -179,22 +179,6 @@ namespace System.Text.Json
         private static bool IsUtf8ContinuationByte(byte value) => (value & 0xC0) == 0x80;
 
         /// <summary>
-        /// Returns <see langword="true"/> iff <paramref name="value"/> is between
-        /// <paramref name="lowerBound"/> and <paramref name="upperBound"/>, inclusive.
-        /// </summary>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static bool IsInRangeInclusive(byte value, byte lowerBound, byte upperBound)
-            => ((byte)(value - lowerBound) <= (byte)(upperBound - lowerBound));
-
-        /// <summary>
-        /// Returns <see langword="true"/> iff <paramref name="value"/> is between
-        /// <paramref name="lowerBound"/> and <paramref name="upperBound"/>, inclusive.
-        /// </summary>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static bool IsInRangeInclusive(uint value, uint lowerBound, uint upperBound)
-            => (value - lowerBound) <= (upperBound - lowerBound);
-
-        /// <summary>
         /// Returns <see langword="true"/> iff the low word of <paramref name="char"/> is a UTF-16 surrogate.
         /// </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -218,7 +202,7 @@ namespace System.Text.Json
             // - Multi-byte sequences which are improperly terminated (no continuation byte when one is
             //   expected) are reported as invalid sequences up to and including the last seen continuation byte.
 
-            Debug.Assert(IsValidUnicodeScalar(ReplacementChar));
+            Debug.Assert(JsonHelpers.IsValidUnicodeScalar(ReplacementChar));
             rune = ReplacementChar;
 
             if (data.IsEmpty)
@@ -233,13 +217,13 @@ namespace System.Text.Json
             if (IsAsciiValue(firstByte))
             {
                 // ASCII byte = well-formed one-byte sequence.
-                Debug.Assert(IsValidUnicodeScalar(firstByte));
+                Debug.Assert(JsonHelpers.IsValidUnicodeScalar(firstByte));
                 rune = firstByte;
                 numBytesConsumed = 1;
                 return SequenceValidity.WellFormed;
             }
 
-            if (!IsInRangeInclusive(firstByte, (byte)0xC2U, (byte)0xF4U))
+            if (!JsonHelpers.IsInRangeInclusive(firstByte, (byte)0xC2U, (byte)0xF4U))
             {
                 // Standalone continuation byte or "always invalid" byte = ill-formed one-byte sequence.
                 goto InvalidOneByteSequence;
@@ -266,7 +250,7 @@ namespace System.Text.Json
             {
                 // Well-formed two-byte sequence.
                 uint scalar = (((uint)firstByte & 0x1FU) << 6) | ((uint)secondByte & 0x3FU);
-                Debug.Assert(IsValidUnicodeScalar(scalar));
+                Debug.Assert(JsonHelpers.IsValidUnicodeScalar(scalar));
                 rune = (int)scalar;
                 numBytesConsumed = 2;
                 return SequenceValidity.WellFormed;
@@ -297,7 +281,7 @@ namespace System.Text.Json
                     {
                         // Well-formed three-byte sequence.
                         scalar |= (uint)thirdByte & 0x3FU;
-                        Debug.Assert(IsValidUnicodeScalar(scalar));
+                        Debug.Assert(JsonHelpers.IsValidUnicodeScalar(scalar));
                         rune = (int)scalar;
                         numBytesConsumed = 3;
                         return SequenceValidity.WellFormed;
@@ -315,8 +299,8 @@ namespace System.Text.Json
                 // Need to check for overlong or out-of-range sequences.
 
                 uint scalar = (((uint)firstByte & 0x07U) << 18) | (((uint)secondByte & 0x3FU) << 12);
-                Debug.Assert(IsValidUnicodeScalar(scalar));
-                if (!IsInRangeInclusive(scalar, 0x10000U, 0x10FFFFU))
+                Debug.Assert(JsonHelpers.IsValidUnicodeScalar(scalar));
+                if (!JsonHelpers.IsInRangeInclusive(scalar, 0x10000U, 0x10FFFFU))
                 {
                     goto OverlongOutOfRangeOrSurrogateSequence;
                 }
@@ -347,7 +331,7 @@ namespace System.Text.Json
                             {
                                 // Well-formed four-byte sequence.
                                 scalar |= (((uint)thirdByte & 0x3FU) << 6) | ((uint)fourthByte & 0x3FU);
-                                Debug.Assert(IsValidUnicodeScalar(scalar));
+                                Debug.Assert(JsonHelpers.IsValidUnicodeScalar(scalar));
                                 rune = (int)scalar;
                                 numBytesConsumed = 4;
                                 return SequenceValidity.WellFormed;
@@ -421,7 +405,7 @@ namespace System.Text.Json
         private static void EscapeNextChars(ReadOnlySpan<char> value, int firstChar, Span<char> destination, ref int consumed, ref int written)
         {
             int nextChar = -1;
-            if (IsInRangeInclusive(firstChar, JsonConstants.HighSurrogateStartValue, JsonConstants.LowSurrogateEndValue))
+            if (JsonHelpers.IsInRangeInclusive(firstChar, JsonConstants.HighSurrogateStartValue, JsonConstants.LowSurrogateEndValue))
             {
                 consumed++;
                 if (value.Length <= consumed || firstChar >= JsonConstants.LowSurrogateStartValue)
@@ -430,7 +414,7 @@ namespace System.Text.Json
                 }
 
                 nextChar = value[consumed];
-                if (!IsInRangeInclusive(nextChar, JsonConstants.LowSurrogateStartValue, JsonConstants.LowSurrogateEndValue))
+                if (!JsonHelpers.IsInRangeInclusive(nextChar, JsonConstants.LowSurrogateStartValue, JsonConstants.LowSurrogateEndValue))
                 {
                     ThrowHelper.ThrowArgumentException_InvalidUTF16(nextChar);
                 }
@@ -482,32 +466,11 @@ namespace System.Text.Json
             }
         }
 
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static bool IsInRangeInclusive(int ch, int start, int end)
-        {
-            return (uint)(ch - start) <= (uint)(end - start);
-        }
-
         /// <summary>
         /// A scalar that represents the Unicode replacement character U+FFFD.
         /// </summary>
         private const int ReplacementChar = 0xFFFD;
 
-        /// <summary>
-        /// Returns <see langword="true"/> iff <paramref name="value"/> is a valid Unicode scalar
-        /// value, i.e., is in [ U+0000..U+D7FF ], inclusive; or [ U+E000..U+10FFFF ], inclusive.
-        /// </summary>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static bool IsValidUnicodeScalar(uint value)
-        {
-            // By XORing the incoming value with 0xD800, surrogate code points
-            // are moved to the range [ U+0000..U+07FF ], and all valid scalar
-            // values are clustered into the single range [ U+0800..U+10FFFF ],
-            // which allows performing a single fast range check.
-
-            return IsInRangeInclusive(value ^ 0xD800U, 0x800U, 0x10FFFFU);
-        }
-
 #if !BUILDING_INBOX_LIBRARY
         private static int WriteHex(int value, Span<char> destination, int written)
         {
index 9285ee0..b533eeb 100644 (file)
@@ -151,7 +151,7 @@ namespace System.Text.Json
                         else
                         {
                             // if (!IsLowSurrogate(ch) && !IsHighSurrogate(ch))
-                            if (!IsInRangeInclusive(ch, JsonConstants.HighSurrogateStart, JsonConstants.LowSurrogateEnd))
+                            if (!JsonHelpers.IsInRangeInclusive(ch, JsonConstants.HighSurrogateStart, JsonConstants.LowSurrogateEnd))
                             {
                                 // 3 byte encoding
                                 chd = unchecked((sbyte)0xE0) | (ch >> 12);
@@ -169,7 +169,7 @@ namespace System.Text.Json
                                 chd = *pSrc;
 
                                 // if (!IsLowSurrogate(chd)) {
-                                if (!IsInRangeInclusive(chd, JsonConstants.LowSurrogateStart, JsonConstants.LowSurrogateEnd))
+                                if (!JsonHelpers.IsInRangeInclusive(chd, JsonConstants.LowSurrogateStart, JsonConstants.LowSurrogateEnd))
                                 {
                                     // high not followed by low -> bad
                                     goto InvalidData;
@@ -240,7 +240,7 @@ namespace System.Text.Json
                     else
                     {
                         // if (!IsLowSurrogate(ch) && !IsHighSurrogate(ch))
-                        if (!IsInRangeInclusive(ch, JsonConstants.HighSurrogateStart, JsonConstants.LowSurrogateEnd))
+                        if (!JsonHelpers.IsInRangeInclusive(ch, JsonConstants.HighSurrogateStart, JsonConstants.LowSurrogateEnd))
                         {
                             if (pAllocatedBufferEnd - pTarget <= 2)
                                 goto DestinationFull;
@@ -267,7 +267,7 @@ namespace System.Text.Json
                             chd = *pSrc;
 
                             // if (!IsLowSurrogate(chd)) {
-                            if (!IsInRangeInclusive(chd, JsonConstants.LowSurrogateStart, JsonConstants.LowSurrogateEnd))
+                            if (!JsonHelpers.IsInRangeInclusive(chd, JsonConstants.LowSurrogateStart, JsonConstants.LowSurrogateEnd))
                             {
                                 // high not followed by low -> bad
                                 goto InvalidData;
index 6830380..d440867 100644 (file)
@@ -4,6 +4,8 @@
 
 using System.Collections.Generic;
 using System.Globalization;
+using System.IO;
+using Newtonsoft.Json;
 using Xunit;
 
 namespace System.Text.Json.Tests
@@ -351,5 +353,148 @@ namespace System.Text.Json.Tests
             Assert.Equal(dataUtf8.Length, json.BytesConsumed);
             Assert.Equal(json.BytesConsumed, json.CurrentState.BytesConsumed);
         }
+
+        [Theory]
+        [InlineData("{\"message\":\"Hello, I am \\\"Ahson!\\\"\"}")]
+        [InlineData("{\"nam\\\"e\":\"ah\\\"son\"}")]
+        [InlineData("{\"Here is a string: \\\"\\\"\":\"Here is a\",\"Here is a back slash\\\\\":[\"Multiline\\r\\n String\\r\\n\",\"\\tMul\\r\\ntiline String\",\"\\\"somequote\\\"\\tMu\\\"\\\"l\\r\\ntiline\\\"another\\\" String\\\\\"],\"str\":\"\\\"\\\"\"}")]
+        [InlineData("[\"\\u0030\\u0031\\u0032\\u0033\\u0034\\u0035\", \"\\u0000\\u002B\", \"a\\u005C\\u0072b\", \"a\\\\u005C\\u0072b\", \"a\\u008E\\u008Fb\", \"a\\uD803\\uDE6Db\", \"a\\uD834\\uDD1Eb\", \"a\\\\uD834\\\\uDD1Eb\"]")]
+        [InlineData("{\"message\":\"Hello /a/b/c \\/ \\r\\b\\n\\f\\t\\/\"}")]
+        [InlineData(null)]  // Large randomly generated string
+        // Cross-checks Utf8JsonReader.GetStringValue() against Newtonsoft's JsonTextReader: both readers walk the
+        // same JSON text and must report identical property names and string values, in the same order.
+        public static void TestingGetString(string jsonString)
+        {
+            // A null payload selects the large pseudo-random string case.
+            if (jsonString is null)
+            {
+                var generator = new Random(42); // fixed seed, so the generated payload is the same on every run
+                var payload = new char[500];
+
+                // Random.Next excludes its upper bound, so only characters from '?' (63) up to '[' (91) are
+                // produced; none of them need to be escaped inside a JSON string.
+                for (int index = 1; index < payload.Length; index++)
+                {
+                    payload[index] = (char)generator.Next('?', '\\');
+                }
+
+                // Surround the content with quotes and plant a single escaped quote (\") in the middle.
+                payload[0] = '"';
+                payload[256] = '\\';
+                payload[257] = '"';
+                payload[payload.Length - 1] = '"';
+                jsonString = new string(payload);
+            }
+
+            // Reference results, collected with Newtonsoft.
+            var expectedNames = new List<string>();
+            var expectedStrings = new List<string>();
+
+            var newtonsoftReader = new JsonTextReader(new StringReader(jsonString));
+            while (newtonsoftReader.Read())
+            {
+                switch (newtonsoftReader.TokenType)
+                {
+                    case JsonToken.String:
+                        expectedStrings.Add(newtonsoftReader.Value.ToString());
+                        break;
+                    case JsonToken.PropertyName:
+                        expectedNames.Add(newtonsoftReader.Value.ToString());
+                        break;
+                }
+            }
+
+            // Results under test, collected with Utf8JsonReader over the UTF-8 encoding of the same text.
+            byte[] dataUtf8 = Encoding.UTF8.GetBytes(jsonString);
+
+            var actualNames = new List<string>();
+            var actualStrings = new List<string>();
+
+            var json = new Utf8JsonReader(dataUtf8, true, default);
+            while (json.Read())
+            {
+                switch (json.TokenType)
+                {
+                    case JsonTokenType.String:
+                        actualStrings.Add(json.GetStringValue());
+                        break;
+                    case JsonTokenType.PropertyName:
+                        actualNames.Add(json.GetStringValue());
+                        break;
+                }
+            }
+
+            AssertSameStrings(expectedNames, actualNames);
+            AssertSameStrings(expectedStrings, actualStrings);
+
+            // The whole payload must have been consumed, and the reader state must agree with the reader.
+            Assert.Equal(dataUtf8.Length, json.BytesConsumed);
+            Assert.Equal(json.BytesConsumed, json.CurrentState.BytesConsumed);
+
+            // Asserts that both lists have the same length and the same element at every position.
+            void AssertSameStrings(List<string> expected, List<string> actual)
+            {
+                Assert.Equal(expected.Count, actual.Count);
+                for (int position = 0; position < expected.Count; position++)
+                {
+                    Assert.Equal(expected[position], actual[position]);
+                }
+            }
+        }
+
+        // Each payload is a single JSON string whose \u escapes contain a lone or mis-ordered surrogate
+        // (DD1E lies in the low-surrogate range, D834 in the high-surrogate range). Reading the token must
+        // succeed, while GetStringValue() must throw InvalidOperationException for every comment-handling mode.
+        [Theory]
+        [InlineData("\"a\\uDD1E\"")]
+        [InlineData("\"a\\uDD1Eb\"")]
+        [InlineData("\"a\\uD834\"")]
+        [InlineData("\"a\\uD834\\u0030\"")]
+        [InlineData("\"a\\uD834\\uD834\"")]
+        [InlineData("\"a\\uD834b\"")]
+        [InlineData("\"a\\uDD1E\\uD834b\"")]
+        [InlineData("\"a\\\\uD834\\uDD1Eb\"")]
+        [InlineData("\"a\\uDD1E\\\\uD834b\"")]
+        public static void TestingGetStringInvalidUTF16(string jsonString)
+        {
+            byte[] dataUtf8 = Encoding.UTF8.GetBytes(jsonString);
+
+            foreach (JsonCommentHandling commentHandling in Enum.GetValues(typeof(JsonCommentHandling)))
+            {
+                var readerOptions = new JsonReaderOptions { CommentHandling = commentHandling };
+                var state = new JsonReaderState(options: readerOptions);
+                var json = new Utf8JsonReader(dataUtf8, isFinalBlock: true, state);
+
+                // Tokenizing succeeds; the invalid escapes only surface when the value is materialized.
+                Assert.True(json.Read());
+                Assert.Equal(JsonTokenType.String, json.TokenType);
+
+                // The reader is declared with var in a local and cannot be captured by an Assert.Throws lambda
+                // if it is a ref struct (TODO confirm), so the outcome is recorded in a flag and asserted afterwards.
+                bool rejected = false;
+                try
+                {
+                    json.GetStringValue();
+                }
+                catch (InvalidOperationException)
+                {
+                    rejected = true;
+                }
+
+                Assert.True(rejected, "Expected InvalidOperationException when trying to get string value for invalid UTF-16 JSON text.");
+            }
+        }
+
+
+
+        // Verifies that ill-formed UTF-8 inside a JSON string does not stop Read() from tokenizing the payload,
+        // while GetStringValue() throws an InvalidOperationException whose InnerException is a
+        // DecoderFallbackException. Runs once for every JsonCommentHandling mode.
+        [Theory]
+        [MemberData(nameof(InvalidUTF8Strings))]
+        public static void TestingGetStringInvalidUTF8(byte[] dataUtf8)
+        {
+            foreach (JsonCommentHandling commentHandling in Enum.GetValues(typeof(JsonCommentHandling)))
+            {
+                var state = new JsonReaderState(options: new JsonReaderOptions { CommentHandling = commentHandling });
+                var json = new Utf8JsonReader(dataUtf8, isFinalBlock: true, state);
+
+                // It is expected that the Utf8JsonReader won't throw an exception here
+                Assert.True(json.Read());
+                Assert.Equal(JsonTokenType.String, json.TokenType);
+
+                // Drain the remaining tokens; the first pass only checks that reading never throws.
+                while (json.Read())
+                {
+                }
+
+                // Second pass over the same bytes: this time materialize every string token.
+                json = new Utf8JsonReader(dataUtf8, isFinalBlock: true, state);
+
+                while (json.Read())
+                {
+                    if (json.TokenType == JsonTokenType.String)
+                    {
+                        try
+                        {
+                            json.GetStringValue();
+                            Assert.True(false, "Expected InvalidOperationException when trying to get string value for invalid UTF-8 JSON text.");
+                        }
+                        catch (InvalidOperationException ex)
+                        {
+                            // Assert.IsType checks the exact runtime type and reports a missing inner
+                            // exception as an assertion failure instead of a NullReferenceException.
+                            Assert.IsType<DecoderFallbackException>(ex.InnerException);
+                        }
+                    }
+                }
+            }
+        }
     }
 }
index b4d0d51..b9514dc 100644 (file)
@@ -561,38 +561,34 @@ namespace System.Text.Json.Tests
         }
 
         [Theory]
-        [InlineData("{\"nam\\\"e\":\"ah\\\"son\"}", JsonCommentHandling.Disallow, "nam\\\"e, ah\\\"son, ")]
+        [InlineData("{\"nam\\\"e\":\"ah\\\"son\"}", "nam\\\"e, ah\\\"son, ", "nam\"e, ah\"son, ")]
         [InlineData("{\"Here is a string: \\\"\\\"\":\"Here is a\",\"Here is a back slash\\\\\":[\"Multiline\\r\\n String\\r\\n\",\"\\tMul\\r\\ntiline String\",\"\\\"somequote\\\"\\tMu\\\"\\\"l\\r\\ntiline\\\"another\\\" String\\\\\"],\"str\":\"\\\"\\\"\"}",
-            JsonCommentHandling.Disallow,
-            "Here is a string: \\\"\\\", Here is a, Here is a back slash\\\\, Multiline\\r\\n String\\r\\n, \\tMul\\r\\ntiline String, \\\"somequote\\\"\\tMu\\\"\\\"l\\r\\ntiline\\\"another\\\" String\\\\, str, \\\"\\\", ")]
-
-        [InlineData("{\"nam\\\"e\":\"ah\\\"son\"}", JsonCommentHandling.Allow, "nam\\\"e, ah\\\"son, ")]
-        [InlineData("{\"Here is a string: \\\"\\\"\":\"Here is a\",\"Here is a back slash\\\\\":[\"Multiline\\r\\n String\\r\\n\",\"\\tMul\\r\\ntiline String\",\"\\\"somequote\\\"\\tMu\\\"\\\"l\\r\\ntiline\\\"another\\\" String\\\\\"],\"str\":\"\\\"\\\"\"}",
-            JsonCommentHandling.Allow,
-            "Here is a string: \\\"\\\", Here is a, Here is a back slash\\\\, Multiline\\r\\n String\\r\\n, \\tMul\\r\\ntiline String, \\\"somequote\\\"\\tMu\\\"\\\"l\\r\\ntiline\\\"another\\\" String\\\\, str, \\\"\\\", ")]
-
-        [InlineData("{\"nam\\\"e\":\"ah\\\"son\"}", JsonCommentHandling.Skip, "nam\\\"e, ah\\\"son, ")]
-        [InlineData("{\"Here is a string: \\\"\\\"\":\"Here is a\",\"Here is a back slash\\\\\":[\"Multiline\\r\\n String\\r\\n\",\"\\tMul\\r\\ntiline String\",\"\\\"somequote\\\"\\tMu\\\"\\\"l\\r\\ntiline\\\"another\\\" String\\\\\"],\"str\":\"\\\"\\\"\"}",
-            JsonCommentHandling.Skip,
-            "Here is a string: \\\"\\\", Here is a, Here is a back slash\\\\, Multiline\\r\\n String\\r\\n, \\tMul\\r\\ntiline String, \\\"somequote\\\"\\tMu\\\"\\\"l\\r\\ntiline\\\"another\\\" String\\\\, str, \\\"\\\", ")]
-        public static void TestJsonReaderUtf8SpecialString(string jsonString, JsonCommentHandling commentHandling, string expectedStr)
+            "Here is a string: \\\"\\\", Here is a, Here is a back slash\\\\, Multiline\\r\\n String\\r\\n, \\tMul\\r\\ntiline String, \\\"somequote\\\"\\tMu\\\"\\\"l\\r\\ntiline\\\"another\\\" String\\\\, str, \\\"\\\", ",
+            "Here is a string: \"\", Here is a, Here is a back slash\\, Multiline\r\n String\r\n, \tMul\r\ntiline String, \"somequote\"\tMu\"\"l\r\ntiline\"another\" String\\, str, \"\", ")]
+        // For every JsonCommentHandling mode: decodes the bytes returned by ReturnBytesHelper and
+        // SequenceReturnBytesHelper as UTF-8 and compares them with expectedStr, then compares the
+        // ObjectToString output with expectedEscapedStr and with JsonTestHelper.NewtonsoftReturnStringHelper.
+        public static void TestJsonReaderUtf8SpecialString(string jsonString, string expectedStr, string expectedEscapedStr)
         {
-            byte[] dataUtf8 = Encoding.UTF8.GetBytes(jsonString);
-            byte[] result = JsonTestHelper.ReturnBytesHelper(dataUtf8, out int length, commentHandling);
-            string actualStr = Encoding.UTF8.GetString(result, 0, length);
+            foreach (JsonCommentHandling commentHandling in Enum.GetValues(typeof(JsonCommentHandling)))
+            {
+                byte[] dataUtf8 = Encoding.UTF8.GetBytes(jsonString);
+                byte[] result = JsonTestHelper.ReturnBytesHelper(dataUtf8, out int length, commentHandling);
+                string actualStr = Encoding.UTF8.GetString(result, 0, length);
 
-            Assert.Equal(expectedStr, actualStr);
+                Assert.Equal(expectedStr, actualStr);
 
-            result = JsonTestHelper.SequenceReturnBytesHelper(dataUtf8, out length, commentHandling);
-            actualStr = Encoding.UTF8.GetString(result, 0, length);
+                result = JsonTestHelper.SequenceReturnBytesHelper(dataUtf8, out length, commentHandling);
+                actualStr = Encoding.UTF8.GetString(result, 0, length);
 
-            Assert.Equal(expectedStr, actualStr);
+                Assert.Equal(expectedStr, actualStr);
 
-            object jsonValues = JsonTestHelper.ReturnObjectHelper(dataUtf8, commentHandling);
-            string str = JsonTestHelper.ObjectToString(jsonValues);
-            ReadOnlySpan<char> expectedSpan = expectedStr.AsSpan(0, expectedStr.Length - 2);
-            ReadOnlySpan<char> actualSpan = str.AsSpan(0, str.Length - 2);
-            Assert.True(expectedSpan.SequenceEqual(actualSpan));
+                object jsonValues = JsonTestHelper.ReturnObjectHelper(dataUtf8, commentHandling);
+                string str = JsonTestHelper.ObjectToString(jsonValues);
+                Assert.Equal(expectedEscapedStr, str);
+
+                // NOTE(review): expectedEscapedStr is overwritten below with the Newtonsoft result, so from the
+                // second loop iteration on, the Assert above compares against that value rather than the inline
+                // test data. The stream and reader are also never disposed here - confirm whether the helper
+                // takes ownership of the reader.
+                Stream stream = new MemoryStream(dataUtf8);
+                TextReader reader = new StreamReader(stream, Encoding.UTF8, false, 1024, true);
+                expectedEscapedStr = JsonTestHelper.NewtonsoftReturnStringHelper(reader);
+                Assert.Equal(expectedEscapedStr, str);
+            }
         }
 
         [Theory]
@@ -1775,14 +1771,14 @@ namespace System.Text.Json.Tests
                     new object[] {"   true  ", true, "True"},
                     new object[] {"   false   ", true, "False"},
                     new object[] {"   null   ", true, "null"},
-                    new object[] {"   \" Test string with \\\"nested quotes \\\" and hex: \\uABCD values! \"   ", true, " Test string with \\\"nested quotes \\\" and hex: \\uABCD values! "},
+                    new object[] {"   \" Test string with \\\"nested quotes \\\" and hex: \\uABCD values! \"   ", true, " Test string with \"nested quotes \" and hex: \uABCD values! "},
 
                     new object[] {"   12345   ", false, "12345"},
                     new object[] {"   12345.67890e-12   ", false, "1.23456789E-08"},
                     new object[] {"   true  ", false, "True"},
                     new object[] {"   false   ", false, "False"},
                     new object[] {"   null   ", false, "null"},
-                    new object[] {"   \" Test string with \\\"nested quotes \\\" and hex: \\uABCD values! \"   ", false, " Test string with \\\"nested quotes \\\" and hex: \\uABCD values! "},
+                    new object[] {"   \" Test string with \\\"nested quotes \\\" and hex: \\uABCD values! \"   ", false, " Test string with \"nested quotes \" and hex: \uABCD values! "},
                 };
             }
         }
@@ -1879,5 +1875,22 @@ namespace System.Text.Json.Tests
                 };
             }
         }
+
+        // Test data for TestingGetStringInvalidUTF8. Every payload is a quoted JSON string of the form
+        // "a<bytes>b" (34 = '"', 97 = 'a', 98 = 'b') whose middle bytes are not well-formed UTF-8.
+        public static IEnumerable<object[]> InvalidUTF8Strings
+        {
+            get
+            {
+                byte[][] payloads =
+                {
+                    // 0xC3 starts a two-byte sequence, but 0x28 is not a continuation byte.
+                    new byte[] { 34, 97, 0xc3, 0x28, 98, 34 },
+                    // 0xA0 and 0xA1 are continuation bytes with no lead byte in front of them.
+                    new byte[] { 34, 97, 0xa0, 0xa1, 98, 34 },
+                    // 0xE2 starts a three-byte sequence; its second byte (0x28) is not a continuation byte.
+                    new byte[] { 34, 97, 0xe2, 0x28, 0xa1, 98, 34 },
+                    // 0xE2 starts a three-byte sequence; its third byte (0x28) is not a continuation byte.
+                    new byte[] { 34, 97, 0xe2, 0x82, 0x28, 98, 34 },
+                    // 0xF0 starts a four-byte sequence; its second byte (0x28) is not a continuation byte.
+                    new byte[] { 34, 97, 0xf0, 0x28, 0x8c, 0xbc, 98, 34 },
+                    // 0xF0 starts a four-byte sequence; its third byte (0x28) is not a continuation byte.
+                    new byte[] { 34, 97, 0xf0, 0x90, 0x28, 0xbc, 98, 34 },
+                    // 0xF0 starts a four-byte sequence; its second and fourth bytes (0x28) are not continuation bytes.
+                    new byte[] { 34, 97, 0xf0, 0x28, 0x8c, 0x28, 98, 34 },
+                };
+
+                // xUnit member data expects one object[] of arguments per test case.
+                var testCases = new List<object[]>(payloads.Length);
+                foreach (byte[] payload in payloads)
+                {
+                    testCases.Add(new object[] { payload });
+                }
+
+                return testCases;
+            }
+        }
     }
 }