From 1510fdb1135fac1c9fa8c1e317b16402afb2b795 Mon Sep 17 00:00:00 2001 From: Ahson Khan Date: Wed, 30 Oct 2019 11:05:12 -0700 Subject: [PATCH] Add tests for custom JavaScriptEncoder to cover the virtual code paths in TextEncoder, and address previous feedback. (dotnet/corefx#42064) * Move using directive within ifdef to make it clear when its used. * Add more tests for custom text encoder case. * Fix typo in comment gaurd -> guard * Fix up the using directives that got removed during merge conflict resolution. * Address feedback - fix 0x7F case, rename vectors to be self-documenting. Commit migrated from https://github.com/dotnet/corefx/commit/62e6ccfa09322db02d6c043226bd382debc915c4 --- .../Text/Encodings/Web/DefaultJavaScriptEncoder.cs | 6 +- .../src/System/Text/Encodings/Web/Sse2Helper.cs | 118 +++++++++--------- .../src/System/Text/Encodings/Web/TextEncoder.cs | 6 +- .../Web/UnsafeRelaxedJavaScriptEncoder.cs | 8 +- .../tests/JavaScriptStringEncoderTests.cs | 137 ++++++++++++++++----- .../Text/Json/Writer/JsonWriterHelper.Escaping.cs | 7 +- 6 files changed, 182 insertions(+), 100 deletions(-) diff --git a/src/libraries/System.Text.Encodings.Web/src/System/Text/Encodings/Web/DefaultJavaScriptEncoder.cs b/src/libraries/System.Text.Encodings.Web/src/System/Text/Encodings/Web/DefaultJavaScriptEncoder.cs index ad6ffb9..3fd3c30 100644 --- a/src/libraries/System.Text.Encodings.Web/src/System/Text/Encodings/Web/DefaultJavaScriptEncoder.cs +++ b/src/libraries/System.Text.Encodings.Web/src/System/Text/Encodings/Web/DefaultJavaScriptEncoder.cs @@ -98,8 +98,9 @@ namespace System.Text.Encodings.Web // Load the next 16 bytes. Vector128 sourceValue = Sse2.LoadVector128(startingAddress); - Vector128 mask = Sse2Helper.CreateAsciiMask(sourceValue); - int index = Sse2.MoveMask(mask); + // Check for ASCII text. Any byte that's not in the ASCII range will already be negative when + // casted to signed byte. + int index = Sse2.MoveMask(sourceValue); if (index != 0) { @@ -196,6 +197,7 @@ namespace System.Text.Encodings.Web idx += utf8BytesConsumedForScalar; } } + Debug.Assert(idx == utf8Text.Length); idx = -1; // All bytes are allowed. diff --git a/src/libraries/System.Text.Encodings.Web/src/System/Text/Encodings/Web/Sse2Helper.cs b/src/libraries/System.Text.Encodings.Web/src/System/Text/Encodings/Web/Sse2Helper.cs index 6caebd3..f36cce2 100644 --- a/src/libraries/System.Text.Encodings.Web/src/System/Text/Encodings/Web/Sse2Helper.cs +++ b/src/libraries/System.Text.Encodings.Web/src/System/Text/Encodings/Web/Sse2Helper.cs @@ -17,11 +17,17 @@ namespace System.Text.Encodings.Web { Debug.Assert(Sse2.IsSupported); - // Space ' ', anything in the control characters range, and anything above short.MaxValue but less than or equal char.MaxValue - Vector128 mask = Sse2.CompareLessThan(sourceValue, s_mask_UInt16_0x20); + // Anything in the control characters range, and anything above short.MaxValue but less than or equal char.MaxValue + // That's because anything between 32768 and 65535 (inclusive) will overflow and become negative. + Vector128 mask = Sse2.CompareLessThan(sourceValue, s_spaceMaskInt16); - mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_UInt16_0x22)); // Quotation Mark '"' - mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_UInt16_0x5C)); // Reverse Solidus '\' + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_quotationMarkMaskInt16)); + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_reverseSolidusMaskInt16)); + + // Anything above the ASCII range, and also including the leftover control character in the ASCII range - 0x7F + // When this method is called with only ASCII data, 0x7F is the only value that would meet this comparison. + // However, when called from "Default", the source could contain characters outside the ASCII range. + mask = Sse2.Or(mask, Sse2.CompareGreaterThan(sourceValue, s_tildeMaskInt16)); return mask; } @@ -31,10 +37,16 @@ namespace System.Text.Encodings.Web { Debug.Assert(Sse2.IsSupported); - Vector128 mask = Sse2.CompareLessThan(sourceValue, s_mask_SByte_0x20); // Control characters, and anything above 0x7E since sbyte.MaxValue is 0x7E + // Anything in the control characters range (except 0x7F), and anything above sbyte.MaxValue but less than or equal byte.MaxValue + // That's because anything between 128 and 255 (inclusive) will overflow and become negative. + Vector128 mask = Sse2.CompareLessThan(sourceValue, s_spaceMaskSByte); + + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_quotationMarkMaskSByte)); + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_reverseSolidusMaskSByte)); - mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_SByte_0x22)); // Quotation Mark " - mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_SByte_0x5C)); // Reverse Solidus \ + // Leftover control character in the ASCII range - 0x7F + // Since we are dealing with sbytes, 0x7F is the only value that would meet this comparison. + mask = Sse2.Or(mask, Sse2.CompareGreaterThan(sourceValue, s_tildeMaskSByte)); return mask; } @@ -46,14 +58,12 @@ namespace System.Text.Encodings.Web Vector128 mask = CreateEscapingMask_UnsafeRelaxedJavaScriptEncoder(sourceValue); - mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_UInt16_0x26)); // Ampersand '&' - mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_UInt16_0x27)); // Apostrophe ''' - mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_UInt16_0x2B)); // Plus sign '+' - mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_UInt16_0x3C)); // Less Than Sign '<' - mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_UInt16_0x3E)); // Greater Than Sign '>' - mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_UInt16_0x60)); // Grave Access '`' - - mask = Sse2.Or(mask, Sse2.CompareGreaterThan(sourceValue, s_mask_UInt16_0x7E)); // Tilde '~', anything above the ASCII range + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_ampersandMaskInt16)); + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_apostropheMaskInt16)); + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_plusSignMaskInt16)); + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_lessThanSignMaskInt16)); + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_greaterThanSignMaskInt16)); + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_graveAccentMaskInt16)); return mask; } @@ -65,12 +75,12 @@ namespace System.Text.Encodings.Web Vector128 mask = CreateEscapingMask_UnsafeRelaxedJavaScriptEncoder(sourceValue); - mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_SByte_0x26)); // Ampersand & - mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_SByte_0x27)); // Apostrophe ' - mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_SByte_0x2B)); // Plus sign + - mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_SByte_0x3C)); // Less Than Sign < - mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_SByte_0x3E)); // Greater Than Sign > - mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_mask_SByte_0x60)); // Grave Access ` + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_ampersandMaskSByte)); + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_apostropheMaskSByte)); + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_plusSignMaskSByte)); + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_lessThanSignMaskSByte)); + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_greaterThanSignMaskSByte)); + mask = Sse2.Or(mask, Sse2.CompareEqual(sourceValue, s_graveAccentMaskSByte)); return mask; } @@ -80,48 +90,38 @@ namespace System.Text.Encodings.Web { Debug.Assert(Sse2.IsSupported); - Vector128 mask = Sse2.CompareLessThan(sourceValue, s_mask_UInt16_0x00); // Null, anything above short.MaxValue but less than or equal char.MaxValue - mask = Sse2.Or(mask, Sse2.CompareGreaterThan(sourceValue, s_mask_UInt16_0x7E)); // Tilde '~', anything above the ASCII range + // Anything above short.MaxValue but less than or equal char.MaxValue + // That's because anything between 32768 and 65535 (inclusive) will overflow and become negative. + Vector128 mask = Sse2.CompareLessThan(sourceValue, s_nullMaskInt16); - return mask; - } + // Anything above the ASCII range + mask = Sse2.Or(mask, Sse2.CompareGreaterThan(sourceValue, s_maxAsciiCharacterMaskInt16)); - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static Vector128 CreateAsciiMask(Vector128 sourceValue) - { - Debug.Assert(Sse2.IsSupported); - - // Null, anything above sbyte.MaxValue but less than or equal byte.MaxValue (i.e. anything above the ASCII range) - Vector128 mask = Sse2.CompareLessThan(sourceValue, s_mask_SByte_0x00); return mask; } - private static readonly Vector128 s_mask_UInt16_0x00 = Vector128.Zero; // Null - - private static readonly Vector128 s_mask_UInt16_0x20 = Vector128.Create((short)0x20); // Space ' ' - - private static readonly Vector128 s_mask_UInt16_0x22 = Vector128.Create((short)0x22); // Quotation Mark '"' - private static readonly Vector128 s_mask_UInt16_0x26 = Vector128.Create((short)0x26); // Ampersand '&' - private static readonly Vector128 s_mask_UInt16_0x27 = Vector128.Create((short)0x27); // Apostrophe ''' - private static readonly Vector128 s_mask_UInt16_0x2B = Vector128.Create((short)0x2B); // Plus sign '+' - private static readonly Vector128 s_mask_UInt16_0x3C = Vector128.Create((short)0x3C); // Less Than Sign '<' - private static readonly Vector128 s_mask_UInt16_0x3E = Vector128.Create((short)0x3E); // Greater Than Sign '>' - private static readonly Vector128 s_mask_UInt16_0x5C = Vector128.Create((short)0x5C); // Reverse Solidus '\' - private static readonly Vector128 s_mask_UInt16_0x60 = Vector128.Create((short)0x60); // Grave Access '`' - - private static readonly Vector128 s_mask_UInt16_0x7E = Vector128.Create((short)0x7E); // Tilde '~' - - private static readonly Vector128 s_mask_SByte_0x00 = Vector128.Zero; // Null - - private static readonly Vector128 s_mask_SByte_0x20 = Vector128.Create((sbyte)0x20); // Space ' ' - - private static readonly Vector128 s_mask_SByte_0x22 = Vector128.Create((sbyte)0x22); // Quotation Mark '"' - private static readonly Vector128 s_mask_SByte_0x26 = Vector128.Create((sbyte)0x26); // Ampersand '&' - private static readonly Vector128 s_mask_SByte_0x27 = Vector128.Create((sbyte)0x27); // Apostrophe ''' - private static readonly Vector128 s_mask_SByte_0x2B = Vector128.Create((sbyte)0x2B); // Plus sign '+' - private static readonly Vector128 s_mask_SByte_0x3C = Vector128.Create((sbyte)0x3C); // Less Than Sign '<' - private static readonly Vector128 s_mask_SByte_0x3E = Vector128.Create((sbyte)0x3E); // Greater Than Sign '>' - private static readonly Vector128 s_mask_SByte_0x5C = Vector128.Create((sbyte)0x5C); // Reverse Solidus '\' - private static readonly Vector128 s_mask_SByte_0x60 = Vector128.Create((sbyte)0x60); // Grave Access '`' + private static readonly Vector128 s_nullMaskInt16 = Vector128.Zero; + private static readonly Vector128 s_spaceMaskInt16 = Vector128.Create((short)' '); + private static readonly Vector128 s_quotationMarkMaskInt16 = Vector128.Create((short)'"'); + private static readonly Vector128 s_ampersandMaskInt16 = Vector128.Create((short)'&'); + private static readonly Vector128 s_apostropheMaskInt16 = Vector128.Create((short)'\''); + private static readonly Vector128 s_plusSignMaskInt16 = Vector128.Create((short)'+'); + private static readonly Vector128 s_lessThanSignMaskInt16 = Vector128.Create((short)'<'); + private static readonly Vector128 s_greaterThanSignMaskInt16 = Vector128.Create((short)'>'); + private static readonly Vector128 s_reverseSolidusMaskInt16 = Vector128.Create((short)'\\'); + private static readonly Vector128 s_graveAccentMaskInt16 = Vector128.Create((short)'`'); + private static readonly Vector128 s_tildeMaskInt16 = Vector128.Create((short)'~'); + private static readonly Vector128 s_maxAsciiCharacterMaskInt16 = Vector128.Create((short)0x7F); // Delete control character + + private static readonly Vector128 s_spaceMaskSByte = Vector128.Create((sbyte)' '); + private static readonly Vector128 s_quotationMarkMaskSByte = Vector128.Create((sbyte)'"'); + private static readonly Vector128 s_ampersandMaskSByte = Vector128.Create((sbyte)'&'); + private static readonly Vector128 s_apostropheMaskSByte = Vector128.Create((sbyte)'\''); + private static readonly Vector128 s_plusSignMaskSByte = Vector128.Create((sbyte)'+'); + private static readonly Vector128 s_lessThanSignMaskSByte = Vector128.Create((sbyte)'<'); + private static readonly Vector128 s_greaterThanSignMaskSByte = Vector128.Create((sbyte)'>'); + private static readonly Vector128 s_reverseSolidusMaskSByte = Vector128.Create((sbyte)'\\'); + private static readonly Vector128 s_graveAccentMaskSByte = Vector128.Create((sbyte)'`'); + private static readonly Vector128 s_tildeMaskSByte = Vector128.Create((sbyte)'~'); } } diff --git a/src/libraries/System.Text.Encodings.Web/src/System/Text/Encodings/Web/TextEncoder.cs b/src/libraries/System.Text.Encodings.Web/src/System/Text/Encodings/Web/TextEncoder.cs index b1350c1..f2de4c5 100644 --- a/src/libraries/System.Text.Encodings.Web/src/System/Text/Encodings/Web/TextEncoder.cs +++ b/src/libraries/System.Text.Encodings.Web/src/System/Text/Encodings/Web/TextEncoder.cs @@ -714,8 +714,9 @@ namespace System.Text.Encodings.Web // Load the next 16 bytes. Vector128 sourceValue = Sse2.LoadVector128(startingAddress); - Vector128 mask = Sse2Helper.CreateAsciiMask(sourceValue); - int index = Sse2.MoveMask(mask); + // Check for ASCII text. Any byte that's not in the ASCII range will already be negative when + // casted to signed byte. + int index = Sse2.MoveMask(sourceValue); if (index != 0) { @@ -812,6 +813,7 @@ namespace System.Text.Encodings.Web idx += utf8BytesConsumedForScalar; } } + Debug.Assert(idx == utf8Text.Length); idx = -1; // All bytes are allowed. diff --git a/src/libraries/System.Text.Encodings.Web/src/System/Text/Encodings/Web/UnsafeRelaxedJavaScriptEncoder.cs b/src/libraries/System.Text.Encodings.Web/src/System/Text/Encodings/Web/UnsafeRelaxedJavaScriptEncoder.cs index 3258bc0..424ee54 100644 --- a/src/libraries/System.Text.Encodings.Web/src/System/Text/Encodings/Web/UnsafeRelaxedJavaScriptEncoder.cs +++ b/src/libraries/System.Text.Encodings.Web/src/System/Text/Encodings/Web/UnsafeRelaxedJavaScriptEncoder.cs @@ -153,8 +153,9 @@ namespace System.Text.Encodings.Web // Load the next 16 bytes. Vector128 sourceValue = Sse2.LoadVector128(startingAddress); - Vector128 mask = Sse2Helper.CreateAsciiMask(sourceValue); - int index = Sse2.MoveMask(mask); + // Check for ASCII text. Any byte that's not in the ASCII range will already be negative when + // casted to signed byte. + int index = Sse2.MoveMask(sourceValue); if (index != 0) { @@ -194,7 +195,7 @@ namespace System.Text.Encodings.Web else { // Check if any of the 16 bytes need to be escaped. - mask = Sse2Helper.CreateEscapingMask_UnsafeRelaxedJavaScriptEncoder(sourceValue); + Vector128 mask = Sse2Helper.CreateEscapingMask_UnsafeRelaxedJavaScriptEncoder(sourceValue); index = Sse2.MoveMask(mask); // If index == 0, that means none of the 16 bytes needed to be escaped. @@ -245,6 +246,7 @@ namespace System.Text.Encodings.Web idx += utf8BytesConsumedForScalar; } } + Debug.Assert(idx == utf8Text.Length); idx = -1; // All bytes are allowed. diff --git a/src/libraries/System.Text.Encodings.Web/tests/JavaScriptStringEncoderTests.cs b/src/libraries/System.Text.Encodings.Web/tests/JavaScriptStringEncoderTests.cs index c577aad..00ddd50 100644 --- a/src/libraries/System.Text.Encodings.Web/tests/JavaScriptStringEncoderTests.cs +++ b/src/libraries/System.Text.Encodings.Web/tests/JavaScriptStringEncoderTests.cs @@ -5,9 +5,12 @@ using System; using System.Buffers; using System.Collections.Generic; +using System.Diagnostics; using System.Globalization; using System.IO; using System.Linq; +using System.Runtime.CompilerServices; +using System.Text.Internal; using System.Text.Unicode; using Xunit; @@ -32,6 +35,7 @@ namespace System.Text.Encodings.Web.Tests [Theory] [MemberData(nameof(EscapingTestData))] + [MemberData(nameof(EscapingTestData_NonAscii))] public unsafe void FindFirstCharacterToEncode(char replacementChar, JavaScriptEncoder encoder, bool requiresEscaping) { Assert.Equal(-1, encoder.FindFirstCharacterToEncodeUtf8(default)); @@ -95,6 +99,7 @@ namespace System.Text.Encodings.Web.Tests { new object[] { 'a', JavaScriptEncoder.Default, false }, // ASCII not escaped new object[] { '\u001F', JavaScriptEncoder.Default, true }, // control character within single byte range + new object[] { '\u007F', JavaScriptEncoder.Default, true }, // control character, sbyte.MaxValue new object[] { '\u2000', JavaScriptEncoder.Default, true }, // space character outside single byte range new object[] { '\u00A2', JavaScriptEncoder.Default, true }, // non-ASCII but < 255 new object[] { '\uA686', JavaScriptEncoder.Default, true }, // non-ASCII above short.MaxValue @@ -111,6 +116,7 @@ namespace System.Text.Encodings.Web.Tests new object[] { 'a', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), false }, new object[] { '\u001F', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + new object[] { '\u007F', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, new object[] { '\u2000', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, new object[] { '\u00A2', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, new object[] { '\uA686', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, @@ -124,38 +130,6 @@ namespace System.Text.Encodings.Web.Tests new object[] { '\'', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, new object[] { '+', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, new object[] { '\uFFFD', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, - - new object[] { 'a', JavaScriptEncoder.Create(UnicodeRanges.All), false }, - new object[] { '\u001F', JavaScriptEncoder.Create(UnicodeRanges.All), true }, - new object[] { '\u2000', JavaScriptEncoder.Create(UnicodeRanges.All), true }, - new object[] { '\u00A2', JavaScriptEncoder.Create(UnicodeRanges.All), false }, - new object[] { '\uA686', JavaScriptEncoder.Create(UnicodeRanges.All), false }, - new object[] { '\u6C49', JavaScriptEncoder.Create(UnicodeRanges.All), false }, - new object[] { '"', JavaScriptEncoder.Create(UnicodeRanges.All), true }, - new object[] { '\\', JavaScriptEncoder.Create(UnicodeRanges.All), true }, - new object[] { '<', JavaScriptEncoder.Create(UnicodeRanges.All), true }, - new object[] { '>', JavaScriptEncoder.Create(UnicodeRanges.All), true }, - new object[] { '&', JavaScriptEncoder.Create(UnicodeRanges.All), true }, - new object[] { '`', JavaScriptEncoder.Create(UnicodeRanges.All), true }, - new object[] { '\'', JavaScriptEncoder.Create(UnicodeRanges.All), true }, - new object[] { '+', JavaScriptEncoder.Create(UnicodeRanges.All), true }, - new object[] { '\uFFFD', JavaScriptEncoder.Create(UnicodeRanges.All), false }, - - new object[] { 'a', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, - new object[] { '\u001F', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, true }, - new object[] { '\u2000', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, true }, - new object[] { '\u00A2', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, - new object[] { '\uA686', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, - new object[] { '\u6C49', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, - new object[] { '"', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, true }, - new object[] { '\\', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, true }, - new object[] { '<', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, - new object[] { '>', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, - new object[] { '&', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, - new object[] { '`', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, - new object[] { '\'', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, - new object[] { '+', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, - new object[] { '\uFFFD', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, }; } } @@ -203,6 +177,7 @@ namespace System.Text.Encodings.Web.Tests { new object[] { 'a', JavaScriptEncoder.Create(UnicodeRanges.All), false }, new object[] { '\u001F', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '\u007F', JavaScriptEncoder.Create(UnicodeRanges.All), true }, new object[] { '\u2000', JavaScriptEncoder.Create(UnicodeRanges.All), true }, new object[] { '\u00A2', JavaScriptEncoder.Create(UnicodeRanges.All), false }, new object[] { '\uA686', JavaScriptEncoder.Create(UnicodeRanges.All), false }, @@ -219,6 +194,7 @@ namespace System.Text.Encodings.Web.Tests new object[] { 'a', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, new object[] { '\u001F', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, true }, + new object[] { '\u007F', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, true }, new object[] { '\u2000', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, true }, new object[] { '\u00A2', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, new object[] { '\uA686', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, @@ -232,6 +208,23 @@ namespace System.Text.Encodings.Web.Tests new object[] { '\'', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, new object[] { '+', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, new object[] { '\uFFFD', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + + new object[] { 'a', new MyCustomEncoder(UnicodeRanges.All), false }, + new object[] { '\u001F', new MyCustomEncoder(UnicodeRanges.All), true }, + new object[] { '\u007F', new MyCustomEncoder(UnicodeRanges.All), true }, + new object[] { '\u2000', new MyCustomEncoder(UnicodeRanges.All), true }, + new object[] { '\u00A2', new MyCustomEncoder(UnicodeRanges.All), false }, + new object[] { '\uA686', new MyCustomEncoder(UnicodeRanges.All), false }, + new object[] { '\u6C49', new MyCustomEncoder(UnicodeRanges.All), false }, + new object[] { '"', new MyCustomEncoder(UnicodeRanges.All), true }, + new object[] { '\\', new MyCustomEncoder(UnicodeRanges.All), true }, + new object[] { '<', new MyCustomEncoder(UnicodeRanges.All), true }, + new object[] { '>', new MyCustomEncoder(UnicodeRanges.All), true }, + new object[] { '&', new MyCustomEncoder(UnicodeRanges.All), true }, + new object[] { '`', new MyCustomEncoder(UnicodeRanges.All), true }, + new object[] { '\'', new MyCustomEncoder(UnicodeRanges.All), true }, + new object[] { '+', new MyCustomEncoder(UnicodeRanges.All), true }, + new object[] { '\uFFFD', new MyCustomEncoder(UnicodeRanges.All), false }, }; } } @@ -305,6 +298,7 @@ namespace System.Text.Encodings.Web.Tests new object[] { JavaScriptEncoder.Create(UnicodeRanges.BasicLatin) }, new object[] { JavaScriptEncoder.Create(UnicodeRanges.All) }, new object[] { JavaScriptEncoder.UnsafeRelaxedJsonEscaping }, + new object[] { new MyCustomEncoder(UnicodeRanges.BasicLatin) }, }; } } @@ -359,10 +353,89 @@ namespace System.Text.Encodings.Web.Tests new object[] { '\uD801', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin) }, new object[] { '\uDC01', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin) }, + + new object[] { '\uD801', new MyCustomEncoder(UnicodeRanges.BasicLatin) }, + new object[] { '\uDC01', new MyCustomEncoder(UnicodeRanges.BasicLatin) }, }; } } + internal sealed class MyCustomEncoder : JavaScriptEncoder + { + private readonly AllowedCharactersBitmap _allowedCharacters; + + public MyCustomEncoder(TextEncoderSettings filter) + { + if (filter == null) + { + throw new ArgumentNullException(nameof(filter)); + } + + _allowedCharacters = filter.GetAllowedCharacters(); + + // Forbid codepoints which aren't mapped to characters or which are otherwise always disallowed + // (includes categories Cc, Cs, Co, Cn, Zs [except U+0020 SPACE], Zl, Zp) + _allowedCharacters.ForbidUndefinedCharacters(); + + // Forbid characters that are special in HTML. + // Even though this is a not HTML encoder, + // it's unfortunately common for developers to + // forget to HTML-encode a string once it has been JS-encoded, + // so this offers extra protection. + ForbidHtmlCharacters(_allowedCharacters); + + // '\' (U+005C REVERSE SOLIDUS) must always be escaped in Javascript / ECMAScript / JSON. + // '/' (U+002F SOLIDUS) is not Javascript / ECMAScript / JSON-sensitive so doesn't need to be escaped. + _allowedCharacters.ForbidCharacter('\\'); + + // '`' (U+0060 GRAVE ACCENT) is ECMAScript-sensitive (see ECMA-262). + _allowedCharacters.ForbidCharacter('`'); + } + + internal static void ForbidHtmlCharacters(AllowedCharactersBitmap allowedCharacters) + { + allowedCharacters.ForbidCharacter('<'); + allowedCharacters.ForbidCharacter('>'); + allowedCharacters.ForbidCharacter('&'); + allowedCharacters.ForbidCharacter('\''); // can be used to escape attributes + allowedCharacters.ForbidCharacter('\"'); // can be used to escape attributes + allowedCharacters.ForbidCharacter('+'); // technically not HTML-specific, but can be used to perform UTF7-based attacks + } + + public MyCustomEncoder(params UnicodeRange[] allowedRanges) : this(new TextEncoderSettings(allowedRanges)) + { } + + public override int MaxOutputCharactersPerInputCharacter => 12; // "\uFFFF\uFFFF" is the longest encoded form + + public override unsafe bool TryEncodeUnicodeScalar(int unicodeScalar, char* buffer, int bufferLength, out int numberOfCharactersWritten) + { + throw new NotImplementedException(); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public override unsafe int FindFirstCharacterToEncode(char* text, int textLength) + { + if (text == null) + { + throw new ArgumentNullException(nameof(text)); + } + + return _allowedCharacters.FindFirstCharacterToEncode(text, textLength); + } + + public override bool WillEncode(int unicodeScalar) + { + if (UnicodeHelpers.IsSupplementaryCodePoint(unicodeScalar)) + { + return true; + } + + Debug.Assert(unicodeScalar >= char.MinValue && unicodeScalar <= char.MaxValue); + + return !_allowedCharacters.IsUnicodeScalarAllowed(unicodeScalar); + } + } + [Fact] public void TestSurrogate() { diff --git a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/JsonWriterHelper.Escaping.cs b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/JsonWriterHelper.Escaping.cs index 2a51013..a6e0c8c 100644 --- a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/JsonWriterHelper.Escaping.cs +++ b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/JsonWriterHelper.Escaping.cs @@ -5,9 +5,12 @@ using System.Buffers; using System.Buffers.Text; using System.Diagnostics; -using System.Runtime.CompilerServices; // Do not remove. Needed for Int32LsbToHexDigit when !BUILDING_INBOX_LIBRARY using System.Text.Encodings.Web; +#if !BUILDING_INBOX_LIBRARY +using System.Runtime.CompilerServices; +#endif + namespace System.Text.Json { // TODO: Replace the escaping logic with publicly shipping APIs from https://github.com/dotnet/corefx/issues/33509 @@ -59,7 +62,7 @@ namespace System.Text.Json public static unsafe int NeedsEscaping(ReadOnlySpan value, JavaScriptEncoder encoder) { // Some implementations of JavaScriptEncoder.FindFirstCharacterToEncode may not accept - // null pointers and gaurd against that. Hence, check up-front to return -1. + // null pointers and guard against that. Hence, check up-front to return -1. if (value.IsEmpty) { return -1; -- 2.7.4