From: Ahson Khan Date: Tue, 22 Oct 2019 07:59:19 +0000 (-0700) Subject: Use Sse2 intrinsics to make NeedsEscaping check faster for large JSON strings (dotne... X-Git-Tag: submit/tizen/20210909.063632~11031^2~241 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=0089be53b8b475509fd4b39c59358418cb65d8d0;p=platform%2Fupstream%2Fdotnet%2Fruntime.git Use Sse2 intrinsics to make NeedsEscaping check faster for large JSON strings (dotnet/corefx#41845) * Use Sse2 intrinsics to make NeedsEscaping check faster for large strings. * Update the utf-8 bytes needsescaping and add tests. * Remove unnecessary bitwise OR and add more tests * Add more tests around surrogates, invalid strings, and characters > short.MaxValue. Commit migrated from https://github.com/dotnet/corefx/commit/7cae92b39d2fed2e4e88e900f2e7d787ed9f6cfa --- diff --git a/src/libraries/System.Text.Json/src/System.Text.Json.csproj b/src/libraries/System.Text.Json/src/System.Text.Json.csproj index 5eec31c..45cb66c 100644 --- a/src/libraries/System.Text.Json/src/System.Text.Json.csproj +++ b/src/libraries/System.Text.Json/src/System.Text.Json.csproj @@ -189,6 +189,7 @@ + diff --git a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/JsonWriterHelper.Escaping.cs b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/JsonWriterHelper.Escaping.cs index a41035b..5c6c27f 100644 --- a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/JsonWriterHelper.Escaping.cs +++ b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/JsonWriterHelper.Escaping.cs @@ -5,10 +5,16 @@ using System.Buffers; using System.Buffers.Text; using System.Diagnostics; +using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Text.Encodings.Web; +#if BUILDING_INBOX_LIBRARY +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +#endif + namespace System.Text.Json { // TODO: Replace the escaping logic with publicly shipping APIs from 
https://github.com/dotnet/corefx/issues/33509 @@ -55,57 +61,202 @@ namespace System.Text.Json [MethodImpl(MethodImplOptions.AggressiveInlining)] private static bool NeedsEscaping(char value) => value > LastAsciiCharacter || AllowList[value] == 0; - public static int NeedsEscaping(ReadOnlySpan value, JavaScriptEncoder encoder) +#if BUILDING_INBOX_LIBRARY + private static readonly Vector128 s_mask_UInt16_0x20 = Vector128.Create((short)0x20); // Space ' ' + + private static readonly Vector128 s_mask_UInt16_0x22 = Vector128.Create((short)0x22); // Quotation Mark '"' + private static readonly Vector128 s_mask_UInt16_0x26 = Vector128.Create((short)0x26); // Ampersand '&' + private static readonly Vector128 s_mask_UInt16_0x27 = Vector128.Create((short)0x27); // Apostrophe ''' + private static readonly Vector128 s_mask_UInt16_0x2B = Vector128.Create((short)0x2B); // Plus sign '+' + private static readonly Vector128 s_mask_UInt16_0x3C = Vector128.Create((short)0x3C); // Less Than Sign '<' + private static readonly Vector128 s_mask_UInt16_0x3E = Vector128.Create((short)0x3E); // Greater Than Sign '>' + private static readonly Vector128 s_mask_UInt16_0x5C = Vector128.Create((short)0x5C); // Reverse Solidus '\' + private static readonly Vector128 s_mask_UInt16_0x60 = Vector128.Create((short)0x60); // Grave Access '`' + + private static readonly Vector128 s_mask_UInt16_0x7E = Vector128.Create((short)0x7E); // Tilde '~' + + private static readonly Vector128 s_mask_SByte_0x20 = Vector128.Create((sbyte)0x20); // Space ' ' + + private static readonly Vector128 s_mask_SByte_0x22 = Vector128.Create((sbyte)0x22); // Quotation Mark '"' + private static readonly Vector128 s_mask_SByte_0x26 = Vector128.Create((sbyte)0x26); // Ampersand '&' + private static readonly Vector128 s_mask_SByte_0x27 = Vector128.Create((sbyte)0x27); // Apostrophe ''' + private static readonly Vector128 s_mask_SByte_0x2B = Vector128.Create((sbyte)0x2B); // Plus sign '+' + private static readonly Vector128 
s_mask_SByte_0x3C = Vector128.Create((sbyte)0x3C); // Less Than Sign '<' + private static readonly Vector128 s_mask_SByte_0x3E = Vector128.Create((sbyte)0x3E); // Greater Than Sign '>' + private static readonly Vector128 s_mask_SByte_0x5C = Vector128.Create((sbyte)0x5C); // Reverse Solidus '\' + private static readonly Vector128 s_mask_SByte_0x60 = Vector128.Create((sbyte)0x60); // Grave Access '`' + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector128 CreateEscapingMask(Vector128 sourceValue) { - int idx; + Debug.Assert(Sse2.IsSupported); - if (encoder != null) - { - idx = encoder.FindFirstCharacterToEncodeUtf8(value); - goto Return; - } + Vector128 mask = Sse2.CompareLessThan(sourceValue, s_mask_UInt16_0x20); // Space ' ', anything in the control characters range + + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_UInt16_0x22)); // Quotation Mark '"' + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_UInt16_0x26)); // Ampersand '&' + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_UInt16_0x27)); // Apostrophe ''' + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_UInt16_0x2B)); // Plus sign '+' + + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_UInt16_0x3C)); // Less Than Sign '<' + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_UInt16_0x3E)); // Greater Than Sign '>' + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_UInt16_0x5C)); // Reverse Solidus '\' + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_UInt16_0x60)); // Grave Access '`' + + mask = Sse2.Or(mask, Sse2.CompareGreaterThan(sourceValue, s_mask_UInt16_0x7E)); // Tilde '~', anything above the ASCII range + + return mask; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector128 CreateEscapingMask(Vector128 sourceValue) + { + Debug.Assert(Sse2.IsSupported); - for (idx = 0; idx < value.Length; idx++) + Vector128 mask = Sse2.CompareLessThan(sourceValue, 
s_mask_SByte_0x20); // Control characters, and anything above 0x7E since sbyte.MaxValue is 0x7E + + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_SByte_0x22)); // Quotation Mark " + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_SByte_0x26)); // Ampersand & + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_SByte_0x27)); // Apostrophe ' + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_SByte_0x2B)); // Plus sign + + + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_SByte_0x3C)); // Less Than Sign < + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_SByte_0x3E)); // Greater Than Sign > + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_SByte_0x5C)); // Reverse Solidus \ + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_SByte_0x60)); // Grave Access ` + + return mask; + } +#endif + + public static unsafe int NeedsEscaping(ReadOnlySpan value, JavaScriptEncoder encoder) + { + fixed (byte* ptr = value) { - if (NeedsEscaping(value[idx])) + int idx = 0; + + if (encoder != null) { + idx = encoder.FindFirstCharacterToEncodeUtf8(value); goto Return; } - } - idx = -1; // all characters allowed +#if BUILDING_INBOX_LIBRARY + if (Sse2.IsSupported) + { + sbyte* startingAddress = (sbyte*)ptr; + while (value.Length - 16 >= idx) + { + Debug.Assert(startingAddress >= ptr && startingAddress <= (ptr + value.Length - 16)); + + // Load the next 16 bytes. + Vector128 sourceValue = Sse2.LoadVector128(startingAddress); + + // Check if any of the 16 bytes need to be escaped. + Vector128 mask = CreateEscapingMask(sourceValue); + + int index = Sse2.MoveMask(mask.AsByte()); + // If index == 0, that means none of the 16 bytes needed to be escaped. + // TrailingZeroCount is relatively expensive, avoid it if possible. + if (index != 0) + { + // Found at least one byte that needs to be escaped, figure out the index of + // the first one found that needed to be escaped within the 16 bytes. 
+ Debug.Assert(index > 0 && index <= 65_535); + int tzc = BitOperations.TrailingZeroCount(index); + Debug.Assert(tzc >= 0 && tzc <= 16); + idx += tzc; + goto Return; + } + idx += 16; + startingAddress += 16; + } + + // Process the remaining characters. + Debug.Assert(value.Length - idx < 16); + } +#endif + + for (; idx < value.Length; idx++) + { + Debug.Assert((ptr + idx) <= (ptr + value.Length)); + if (NeedsEscaping(*(ptr + idx))) + { + goto Return; + } + } - Return: - return idx; + idx = -1; // all characters allowed + + Return: + return idx; + } } public static unsafe int NeedsEscaping(ReadOnlySpan value, JavaScriptEncoder encoder) { - int idx; - - // Some implementations of JavascriptEncoder.FindFirstCharacterToEncode may not accept - // null pointers and gaurd against that. Hence, check up-front and fall down to return -1. - if (encoder != null && !value.IsEmpty) + fixed (char* ptr = value) { - fixed (char* ptr = value) + int idx = 0; + + // Some implementations of JavascriptEncoder.FindFirstCharacterToEncode may not accept + // null pointers and gaurd against that. Hence, check up-front and fall down to return -1. + if (encoder != null && !value.IsEmpty) { idx = encoder.FindFirstCharacterToEncode(ptr, value.Length); + goto Return; } - goto Return; - } - for (idx = 0; idx < value.Length; idx++) - { - if (NeedsEscaping(value[idx])) +#if BUILDING_INBOX_LIBRARY + if (Sse2.IsSupported) { - goto Return; + short* startingAddress = (short*)ptr; + while (value.Length - 8 >= idx) + { + Debug.Assert(startingAddress >= ptr && startingAddress <= (ptr + value.Length - 8)); + + // Load the next 8 characters. + Vector128 sourceValue = Sse2.LoadVector128(startingAddress); + + // Check if any of the 8 characters need to be escaped. + Vector128 mask = CreateEscapingMask(sourceValue); + + int index = Sse2.MoveMask(mask.AsByte()); + // If index == 0, that means none of the 8 characters needed to be escaped. + // TrailingZeroCount is relatively expensive, avoid it if possible. 
+ if (index != 0) + { + // Found at least one character that needs to be escaped, figure out the index of + // the first one found that needed to be escaped within the 8 characters. + Debug.Assert(index > 0 && index <= 65_535); + int tzc = BitOperations.TrailingZeroCount(index); + Debug.Assert(tzc % 2 == 0 && tzc >= 0 && tzc <= 16); + idx += tzc >> 1; + goto Return; + } + idx += 8; + startingAddress += 8; + } + + // Process the remaining characters. + Debug.Assert(value.Length - idx < 8); + } +#endif + + for (; idx < value.Length; idx++) + { + Debug.Assert((ptr + idx) <= (ptr + value.Length)); + if (NeedsEscaping(*(ptr + idx))) + { + goto Return; + } } - } - idx = -1; // all characters allowed + idx = -1; // All characters are allowed. - Return: - return idx; + Return: + return idx; + } } public static int GetMaxEscapedLength(int textLength, int firstIndexToEscape) diff --git a/src/libraries/System.Text.Json/tests/Utf8JsonWriterTests.cs b/src/libraries/System.Text.Json/tests/Utf8JsonWriterTests.cs index f8b625f..1456e0b 100644 --- a/src/libraries/System.Text.Json/tests/Utf8JsonWriterTests.cs +++ b/src/libraries/System.Text.Json/tests/Utf8JsonWriterTests.cs @@ -178,6 +178,396 @@ namespace System.Text.Json.Tests JsonTestHelper.AssertContents("\"\u2020\\\"\"", output); } + [Theory] + [MemberData(nameof(EscapingTestData))] + public void EscapingTestWhileWriting(char replacementChar, JavaScriptEncoder encoder, bool requiresEscaping) + { + var writerOptions = new JsonWriterOptions { Encoder = encoder }; + + { + ReadOnlyMemory written = WriteStringHelper(writerOptions, null); + Assert.Equal(-1, written.Span.IndexOf((byte)'\\')); + + written = WriteUtf8StringHelper(writerOptions, null); + Assert.Equal(-1, written.Span.IndexOf((byte)'\\')); + + written = WriteStringHelper(writerOptions, string.Empty); + Assert.Equal(-1, written.Span.IndexOf((byte)'\\')); + + written = WriteUtf8StringHelper(writerOptions, Array.Empty()); + Assert.Equal(-1, written.Span.IndexOf((byte)'\\')); 
+ } + + var random = new Random(42); + for (int dataLength = 0; dataLength < 50; dataLength++) + { + char[] str = new char[dataLength]; + for (int i = 0; i < dataLength; i++) + { + str[i] = (char)random.Next(97, 123); + } + string baseStr = new string(str); + byte[] sourceUtf8 = Encoding.UTF8.GetBytes(baseStr); + + ReadOnlyMemory written = WriteStringHelper(writerOptions, baseStr); + Assert.Equal(-1, written.Span.IndexOf((byte)'\\')); + + written = WriteUtf8StringHelper(writerOptions, sourceUtf8); + Assert.Equal(-1, written.Span.IndexOf((byte)'\\')); + + for (int i = 0; i < dataLength; i++) + { + char[] changed = baseStr.ToCharArray(); + changed[i] = replacementChar; + string newStr = new string(changed); + sourceUtf8 = Encoding.UTF8.GetBytes(newStr); + + written = WriteStringHelper(writerOptions, newStr); + int escapedIndex = written.Span.IndexOf((byte)'\\'); + Assert.Equal(requiresEscaping ? (i + 1) : -1, escapedIndex); // Account for the start quote + + written = WriteUtf8StringHelper(writerOptions, sourceUtf8); + escapedIndex = written.Span.IndexOf((byte)'\\'); + Assert.Equal(requiresEscaping ? (i + 1) : -1, escapedIndex); // Account for the start quote + } + + if (dataLength != 0) + { + char[] changed = baseStr.ToCharArray(); + changed.AsSpan().Fill(replacementChar); + string newStr = new string(changed); + sourceUtf8 = Encoding.UTF8.GetBytes(newStr); + + written = WriteStringHelper(writerOptions, newStr); + int escapedIndex = written.Span.IndexOf((byte)'\\'); + Assert.Equal(requiresEscaping ? 1 : -1, escapedIndex); // Account for the start quote + + written = WriteUtf8StringHelper(writerOptions, sourceUtf8); + escapedIndex = written.Span.IndexOf((byte)'\\'); + Assert.Equal(requiresEscaping ? 
1 : -1, escapedIndex); // Account for the start quote + } + } + } + + public static IEnumerable EscapingTestData + { + get + { + return new List + { + new object[] { 'a', null, false }, // ASCII not escaped + new object[] { '\u001F', null, true }, // control character within single byte range + new object[] { '\u2000', null, true }, // space character outside single byte range + new object[] { '\u00A2', null, true }, // non-ASCII but < 255 + new object[] { '\uA686', null, true }, // non-ASCII above short.MaxValue + new object[] { '\u6C49', null, true }, // non-ASCII from chinese alphabet - multibyte + new object[] { '"', null, true }, // ASCII but must always be escaped in JSON + new object[] { '\\', null, true }, // ASCII but must always be escaped in JSON + new object[] { '<', null, true }, // ASCII but escaped by default + new object[] { '>', null, true }, // ASCII but escaped by default + new object[] { '&', null, true }, // ASCII but escaped by default + new object[] { '`', null, true }, // ASCII but escaped by default + new object[] { '\'', null, true }, // ASCII but escaped by default + new object[] { '+', null, true }, // ASCII but escaped by default + + new object[] { 'a', JavaScriptEncoder.Default, false }, + new object[] { '\u001F', JavaScriptEncoder.Default, true }, + new object[] { '\u2000', JavaScriptEncoder.Default, true }, + new object[] { '\u00A2', JavaScriptEncoder.Default, true }, + new object[] { '\uA686', JavaScriptEncoder.Default, true }, + new object[] { '\u6C49', JavaScriptEncoder.Default, true }, + new object[] { '"', JavaScriptEncoder.Default, true }, + new object[] { '\\', JavaScriptEncoder.Default, true }, + new object[] { '<', JavaScriptEncoder.Default, true }, + new object[] { '>', JavaScriptEncoder.Default, true }, + new object[] { '&', JavaScriptEncoder.Default, true }, + new object[] { '`', JavaScriptEncoder.Default, true }, + new object[] { '\'', JavaScriptEncoder.Default, true }, + new object[] { '+', JavaScriptEncoder.Default, 
true }, + + new object[] { 'a', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), false }, + new object[] { '\u001F', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + new object[] { '\u2000', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + new object[] { '\u00A2', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + new object[] { '\uA686', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + new object[] { '\u6C49', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + new object[] { '"', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + new object[] { '\\', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + new object[] { '<', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + new object[] { '>', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + new object[] { '&', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + new object[] { '`', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + new object[] { '\'', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + new object[] { '+', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + + new object[] { 'a', JavaScriptEncoder.Create(UnicodeRanges.All), false }, + new object[] { '\u001F', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '\u2000', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '\u00A2', JavaScriptEncoder.Create(UnicodeRanges.All), false }, + new object[] { '\uA686', JavaScriptEncoder.Create(UnicodeRanges.All), false }, + new object[] { '\u6C49', JavaScriptEncoder.Create(UnicodeRanges.All), false }, + new object[] { '"', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '\\', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '<', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '>', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '&', 
JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '`', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '\'', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '+', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + + new object[] { 'a', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '\u001F', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, true }, + new object[] { '\u2000', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, true }, + new object[] { '\u00A2', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '\uA686', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '\u6C49', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '"', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, true }, + new object[] { '\\', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, true }, + new object[] { '<', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '>', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '&', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '`', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '\'', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '+', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + }; + } + } + + [Theory] + [MemberData(nameof(EscapingTestData_NonAscii))] + public unsafe void WriteString_NonAscii(char replacementChar, JavaScriptEncoder encoder, bool requiresEscaping) + { + var writerOptions = new JsonWriterOptions { Encoder = encoder }; + var random = new Random(42); + for (int dataLength = 1; dataLength < 50; dataLength++) + { + char[] str = new char[dataLength]; + for (int i = 0; i < dataLength; i++) + { + str[i] = (char)random.Next(0x2E9B, 0x2EF4); // CJK Radicals Supplement characters + } + string baseStr = new string(str); + byte[] sourceUtf8 = Encoding.UTF8.GetBytes(baseStr); + + ReadOnlyMemory 
written = WriteStringHelper(writerOptions, baseStr); + Assert.Equal(-1, written.Span.IndexOf((byte)'\\')); + + written = WriteUtf8StringHelper(writerOptions, sourceUtf8); + Assert.Equal(-1, written.Span.IndexOf((byte)'\\')); + + for (int i = 0; i < dataLength; i++) + { + string source = baseStr.Insert(i, new string(replacementChar, 1)); + sourceUtf8 = Encoding.UTF8.GetBytes(source); + + written = WriteStringHelper(writerOptions, source); + int escapedIndex = written.Span.IndexOf((byte)'\\'); + // Each CJK character expands to 3 utf-8 bytes. + Assert.Equal(requiresEscaping ? ((i * 3) + 1) : -1, escapedIndex); // Account for the start quote + + written = WriteUtf8StringHelper(writerOptions, sourceUtf8); + escapedIndex = written.Span.IndexOf((byte)'\\'); + // Each CJK character expands to 3 utf-8 bytes. + Assert.Equal(requiresEscaping ? ((i * 3) + 1) : -1, escapedIndex); // Account for the start quote + } + } + } + + public static IEnumerable EscapingTestData_NonAscii + { + get + { + return new List + { + new object[] { 'a', JavaScriptEncoder.Create(UnicodeRanges.All), false }, + new object[] { '\u001F', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '\u2000', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '\u00A2', JavaScriptEncoder.Create(UnicodeRanges.All), false }, + new object[] { '\uA686', JavaScriptEncoder.Create(UnicodeRanges.All), false }, + new object[] { '\u6C49', JavaScriptEncoder.Create(UnicodeRanges.All), false }, + new object[] { '"', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '\\', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '<', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '>', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '&', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '`', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '\'', 
JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '+', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + + new object[] { 'a', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '\u001F', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, true }, + new object[] { '\u2000', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, true }, + new object[] { '\u00A2', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '\uA686', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '\u6C49', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '"', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, true }, + new object[] { '\\', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, true }, + new object[] { '<', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '>', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '&', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '`', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '\'', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '+', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + }; + } + } + + [Theory] + [MemberData(nameof(JavaScriptEncoders))] + public void EscapingTestWhileWritingSurrogate(JavaScriptEncoder encoder) + { + char highSurrogate = '\uD801'; + char lowSurrogate = '\uDC37'; + var writerOptions = new JsonWriterOptions { Encoder = encoder }; + var random = new Random(42); + for (int dataLength = 2; dataLength < 50; dataLength++) + { + char[] str = new char[dataLength]; + for (int i = 0; i < dataLength; i++) + { + str[i] = (char)random.Next(97, 123); + } + string baseStr = new string(str); + byte[] sourceUtf8 = Encoding.UTF8.GetBytes(baseStr); + + ReadOnlyMemory written = WriteStringHelper(writerOptions, baseStr); + Assert.Equal(-1, written.Span.IndexOf((byte)'\\')); + + written = WriteUtf8StringHelper(writerOptions, sourceUtf8); + 
Assert.Equal(-1, written.Span.IndexOf((byte)'\\')); + + for (int i = 0; i < dataLength - 1; i++) + { + char[] changed = baseStr.ToCharArray(); + changed[i] = highSurrogate; + changed[i + 1] = lowSurrogate; + string newStr = new string(changed); + sourceUtf8 = Encoding.UTF8.GetBytes(newStr); + + written = WriteStringHelper(writerOptions, newStr); + int escapedIndex = written.Span.IndexOf((byte)'\\'); + Assert.Equal(i + 1, escapedIndex); // Account for the start quote + + written = WriteUtf8StringHelper(writerOptions, sourceUtf8); + escapedIndex = written.Span.IndexOf((byte)'\\'); + Assert.Equal(i + 1, escapedIndex); // Account for the start quote + } + + { + char[] changed = baseStr.ToCharArray(); + + for (int i = 0; i < changed.Length - 1; i += 2) + { + changed[i] = highSurrogate; + changed[i + 1] = lowSurrogate; + } + + string newStr = new string(changed); + sourceUtf8 = Encoding.UTF8.GetBytes(newStr); + + written = WriteStringHelper(writerOptions, newStr); + int escapedIndex = written.Span.IndexOf((byte)'\\'); + Assert.Equal(1, escapedIndex); // Account for the start quote + + written = WriteUtf8StringHelper(writerOptions, sourceUtf8); + escapedIndex = written.Span.IndexOf((byte)'\\'); + Assert.Equal(1, escapedIndex); // Account for the start quote + } + } + } + + public static IEnumerable JavaScriptEncoders + { + get + { + return new List + { + new object[] { null }, + new object[] { JavaScriptEncoder.Default }, + new object[] { JavaScriptEncoder.Create(UnicodeRanges.BasicLatin) }, + new object[] { JavaScriptEncoder.Create(UnicodeRanges.All) }, + new object[] { JavaScriptEncoder.UnsafeRelaxedJsonEscaping }, + }; + } + } + + [Theory] + [MemberData(nameof(InvalidEscapingTestData))] + public unsafe void WriteStringInvalidCharacter(char replacementChar, JavaScriptEncoder encoder, bool requiresEscaping) + { + var writerOptions = new JsonWriterOptions { Encoder = encoder }; + var random = new Random(42); + for (int dataLength = 0; dataLength < 47; dataLength++) + { + 
char[] str = new char[dataLength]; + for (int i = 0; i < dataLength; i++) + { + str[i] = (char)random.Next(97, 123); + } + string baseStr = new string(str); + byte[] baseStrUtf8 = Encoding.UTF8.GetBytes(baseStr); + + for (int i = 0; i < dataLength; i++) + { + char[] changed = baseStr.ToCharArray(); + changed[i] = replacementChar; + string source = new string(changed); + byte[] sourceUtf8 = new byte[baseStrUtf8.Length]; + baseStrUtf8.AsSpan().CopyTo(sourceUtf8); + sourceUtf8[i] = 0xC3; // Invalid, first byte of a 2-byte utf-8 character + + ReadOnlyMemory written = WriteStringHelper(writerOptions, source); + // Some encoders don't escape replacement character + Assert.Equal(requiresEscaping ? i + 1 : -1, written.Span.IndexOf((byte)'\\')); // Account for the start quote + + written = WriteUtf8StringHelper(writerOptions, sourceUtf8); + // Some encoders don't escape replacement character + Assert.Equal(requiresEscaping ? i + 1 : -1, written.Span.IndexOf((byte)'\\')); // Account for the start quote + } + } + } + + public static IEnumerable InvalidEscapingTestData + { + get + { + return new List + { + new object[] { '\uD801', JavaScriptEncoder.Default, true }, // Invalid, high surrogate alone + new object[] { '\uDC01', JavaScriptEncoder.Default, true }, // Invalid, low surrogate alone + + new object[] { '\uD801', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '\uDC01', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + + new object[] { '\uD801', JavaScriptEncoder.Create(UnicodeRanges.All), false }, + new object[] { '\uDC01', JavaScriptEncoder.Create(UnicodeRanges.All), false }, + + new object[] { '\uD801', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + new object[] { '\uDC01', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + }; + } + } + + private static ReadOnlyMemory WriteStringHelper(JsonWriterOptions writerOptions, string str) + { + var output = new ArrayBufferWriter(); + using (var writer = new 
Utf8JsonWriter(output, writerOptions)) + { + writer.WriteStringValue(str); + } + return output.WrittenMemory; + } + + private static ReadOnlyMemory WriteUtf8StringHelper(JsonWriterOptions writerOptions, byte[] utf8str) + { + var output = new ArrayBufferWriter(); + using (var writer = new Utf8JsonWriter(output, writerOptions)) + { + writer.WriteStringValue(utf8str); + } + return output.WrittenMemory; + } + [Fact] public void WriteJsonWritesToIBWOnDemand_Dispose() {