From 748ad60dff0b14f4aabe08f140ac74c73709cbec Mon Sep 17 00:00:00 2001 From: Levi Broderick Date: Fri, 21 Jun 2019 20:38:05 -0700 Subject: [PATCH] Add UTF-8 support to TextEncoder (dotnet/corefx#38356) Commit migrated from https://github.com/dotnet/corefx/commit/9b486c1ea7bcfe7d09206ae449bffb049754ade5 --- .../src/Common/TextEncoderExtensions.cs | 71 ++++++ .../src/Properties/InternalsVisibleTo.cs | 10 + .../src/System.Text.Encodings.Web.csproj | 11 + .../src/System/Text/Encodings/Web/TextEncoder.cs | 235 +++++++++++++++++++ .../src/System/Text/Unicode/UnicodeHelpers.cs | 254 +++++++++++++++++++++ .../tests/CommonTestEncoder.cs | 105 --------- .../tests/ConfigurableScalarTextEncoder.cs | 38 +++ .../tests/System.Text.Encodings.Web.Tests.csproj | 17 +- .../tests/TextEncoderTests.cs | 242 +++++++++++++++++++- 9 files changed, 870 insertions(+), 113 deletions(-) create mode 100644 src/libraries/System.Text.Encodings.Web/src/Common/TextEncoderExtensions.cs create mode 100644 src/libraries/System.Text.Encodings.Web/src/Properties/InternalsVisibleTo.cs delete mode 100644 src/libraries/System.Text.Encodings.Web/tests/CommonTestEncoder.cs create mode 100644 src/libraries/System.Text.Encodings.Web/tests/ConfigurableScalarTextEncoder.cs diff --git a/src/libraries/System.Text.Encodings.Web/src/Common/TextEncoderExtensions.cs b/src/libraries/System.Text.Encodings.Web/src/Common/TextEncoderExtensions.cs new file mode 100644 index 0000000..6f7f7fe --- /dev/null +++ b/src/libraries/System.Text.Encodings.Web/src/Common/TextEncoderExtensions.cs @@ -0,0 +1,71 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System.Buffers; +using System.Diagnostics; +using System.Reflection; +using System.Runtime.CompilerServices; + +namespace System.Text.Encodings.Web +{ + /// + /// Provides access to APIs that aren't part of the ref asms. + /// + internal static class TextEncoderExtensions + { + private delegate OperationStatus EncodeUtf8Del(TextEncoder encoder, ReadOnlySpan utf8Source, Span utf8Destination, out int bytesConsumed, out int bytesWritten, bool isFinalBlock); + private delegate int FindFirstCharacterToEncodeUtf8Del(TextEncoder encoder, ReadOnlySpan utf8Text); + + private static readonly EncodeUtf8Del s_encodeUtf8Fn = CreateEncodeUtf8Fn(); + private static readonly FindFirstCharacterToEncodeUtf8Del s_findFirstCharToEncodeUtf8Fn = CreateFindFirstCharToEncodeUtf8Fn(); + + private static EncodeUtf8Del CreateEncodeUtf8Fn() + { + // Locate the shim method, which is able to perform fast virtual dispatch, + // then create a delegate to it. + + MethodInfo methodInfo = typeof(TextEncoder).GetMethod("EncodeUtf8Shim", BindingFlags.NonPublic | BindingFlags.Static); + Debug.Assert(methodInfo != null); + EncodeUtf8Del del = (EncodeUtf8Del)methodInfo.CreateDelegate(typeof(EncodeUtf8Del)); + + // Now invoke the delegate once. The reason for this is that the delegate probably + // points to the pre-jit stub rather than the final codegen for the method, which + // means that invocations of this delegate will incur an unnecessary call back into + // the VM. Invoking the delegate forces JIT to take place now, so a future delegate + // will point directly to the codegen rather than the pre-jit stub. + + del(HtmlEncoder.Default, ReadOnlySpan.Empty, Span.Empty, out _, out _, false); + + // Now create the delegate again and return it to the caller. + // The delegate should now be pointing directly to the static method's codegen. 
+ + return (EncodeUtf8Del)methodInfo.CreateDelegate(typeof(EncodeUtf8Del)); + } + + private static FindFirstCharacterToEncodeUtf8Del CreateFindFirstCharToEncodeUtf8Fn() + { + // See the comments in CreateEncodeUtf8Fn for an overview of how this logic works. + + MethodInfo methodInfo = typeof(TextEncoder).GetMethod("FindFirstCharacterToEncodeUtf8Shim", BindingFlags.NonPublic | BindingFlags.Static); + Debug.Assert(methodInfo != null); + + FindFirstCharacterToEncodeUtf8Del del = (FindFirstCharacterToEncodeUtf8Del)methodInfo.CreateDelegate(typeof(FindFirstCharacterToEncodeUtf8Del)); + del(HtmlEncoder.Default, ReadOnlySpan.Empty); + + return (FindFirstCharacterToEncodeUtf8Del)methodInfo.CreateDelegate(typeof(FindFirstCharacterToEncodeUtf8Del)); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static OperationStatus EncodeUtf8(this TextEncoder encoder, ReadOnlySpan utf8Source, Span utf8Destination, out int bytesConsumed, out int bytesWritten, bool isFinalBlock = true) + { + return s_encodeUtf8Fn(encoder, utf8Source, utf8Destination, out bytesConsumed, out bytesWritten, isFinalBlock); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static int FindFirstCharacterToEncodeUtf8(this TextEncoder encoder, ReadOnlySpan utf8Text) + { + return s_findFirstCharToEncodeUtf8Fn(encoder, utf8Text); + } + } +} diff --git a/src/libraries/System.Text.Encodings.Web/src/Properties/InternalsVisibleTo.cs b/src/libraries/System.Text.Encodings.Web/src/Properties/InternalsVisibleTo.cs new file mode 100644 index 0000000..21107aa --- /dev/null +++ b/src/libraries/System.Text.Encodings.Web/src/Properties/InternalsVisibleTo.cs @@ -0,0 +1,10 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System.Runtime.CompilerServices; + +// This attribute only exists to prevent the build system from tree shaking +// internal members out of the final compiled binary. + +[assembly: InternalsVisibleTo("System.Text.Encodings.Web.Tests, PublicKey=00240000048000009400000006020000002400005253413100040000010001004b86c4cb78549b34bab61a3b1800e23bfeb5b3ec390074041536a7e3cbd97f5f04cf0f857155a8928eaa29ebfd11cfbbad3ba70efea7bda3226c6a8d370a4cd303f714486b6ebc225985a638471e6ef571cc92a4613c00b8fa65d61ccee0cbe5f36330c9a01f4183559f1bef24cc2917c6d913e3a541333a1d05d9bed22b38cb")] diff --git a/src/libraries/System.Text.Encodings.Web/src/System.Text.Encodings.Web.csproj b/src/libraries/System.Text.Encodings.Web/src/System.Text.Encodings.Web.csproj index d918978..e27938f 100644 --- a/src/libraries/System.Text.Encodings.Web/src/System.Text.Encodings.Web.csproj +++ b/src/libraries/System.Text.Encodings.Web/src/System.Text.Encodings.Web.csproj @@ -6,6 +6,9 @@ netcoreapp-Debug;netcoreapp-Release;netstandard-Debug;netstandard-Release;uap-Windows_NT-Debug;uap-Windows_NT-Release + + + @@ -20,6 +23,14 @@ + + System\Text\UnicodeDebug.cs + + + System\Text\UnicodeUtility.cs + + + diff --git a/src/libraries/System.Text.Encodings.Web/src/System/Text/Encodings/Web/TextEncoder.cs b/src/libraries/System.Text.Encodings.Web/src/System/Text/Encodings/Web/TextEncoder.cs index b98268f..b9eddfc 100644 --- a/src/libraries/System.Text.Encodings.Web/src/System/Text/Encodings/Web/TextEncoder.cs +++ b/src/libraries/System.Text.Encodings.Web/src/System/Text/Encodings/Web/TextEncoder.cs @@ -2,10 +2,12 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. 
+using System.Buffers; using System.ComponentModel; using System.Diagnostics; using System.IO; using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; using System.Text.Unicode; namespace System.Text.Encodings.Web @@ -322,6 +324,190 @@ namespace System.Text.Encodings.Web } } + /// + /// Encodes the supplied UTF-8 text. + /// + /// A source buffer containing the UTF-8 text to encode. + /// The destination buffer to which the encoded form of + /// will be written. + /// The number of bytes consumed from the buffer. + /// The number of bytes written to the buffer. + /// if there is further source data that needs to be encoded; + /// if there is no further source data that needs to be encoded. + /// An describing the result of the encoding operation. + /// The buffers and must not overlap. + internal unsafe virtual OperationStatus EncodeUtf8(ReadOnlySpan utf8Source, Span utf8Destination, out int bytesConsumed, out int bytesWritten, bool isFinalBlock = true) + { + // Optimization: Detect how much "doesn't require escaping" data exists at the beginning of the buffer, + // and memcpy it directly to the destination. + + int numBytesToCopy = FindFirstCharacterToEncodeUtf8(utf8Source); + if (numBytesToCopy < 0) + { + numBytesToCopy = utf8Source.Length; + } + + if (!utf8Source.Slice(0, numBytesToCopy).TryCopyTo(utf8Destination)) + { + // There wasn't enough room in the destination to copy over the entire source buffer. + // We'll instead copy over as much as we can and return DestinationTooSmall. We do need to + // account for the fact that we don't want to truncate a multi-byte UTF-8 subsequence + // mid-sequence (since a subsequent slice and call to EncodeUtf8 would produce invalid + // data). 
+ + utf8Source = utf8Source.Slice(0, utf8Destination.Length + 1); // guaranteed not to fail since utf8Source is larger than utf8Destination + for (int i = utf8Source.Length - 1; i >= 0; i--) + { + if (!UnicodeHelpers.IsUtf8ContinuationByte(in utf8Source[i])) + { + utf8Source.Slice(0, i).CopyTo(utf8Destination); + bytesConsumed = i; + bytesWritten = i; + return OperationStatus.DestinationTooSmall; + } + } + + // If we got to this point, either somebody mutated the input buffer out from under us, or + // the FindFirstCharacterToEncodeUtf8 method was overridden incorrectly such that it attempted + // to skip over ill-formed data. In either case we don't know how to perform a partial memcpy + // so we shouldn't do anything at all. We'll return DestinationTooSmall here since the caller + // can resolve the issue by increasing the size of the destination buffer so that it's at least + // as large as the input buffer, which would skip over this entire code path. + + bytesConsumed = 0; + bytesWritten = 0; + return OperationStatus.DestinationTooSmall; + } + + // If we copied over all of the input data, success! + + if (numBytesToCopy == utf8Source.Length) + { + bytesConsumed = numBytesToCopy; + bytesWritten = numBytesToCopy; + return OperationStatus.Done; + } + + // There's data that must be encoded. Fall back to the scalar-by-scalar slow path. 
+ + int originalUtf8SourceLength = utf8Source.Length; + int originalUtf8DestinationLength = utf8Destination.Length; + + utf8Source = utf8Source.Slice(numBytesToCopy); + utf8Destination = utf8Destination.Slice(numBytesToCopy); + + const int TempUtf16CharBufferLength = 24; // arbitrarily chosen, but sufficient for any reasonable implementation + char* pTempCharBuffer = stackalloc char[TempUtf16CharBufferLength]; + + const int TempUtf8ByteBufferLength = TempUtf16CharBufferLength * 3 /* max UTF-8 output code units per UTF-16 input code unit */; + byte* pTempUtf8Buffer = stackalloc byte[TempUtf8ByteBufferLength]; + + while (!utf8Source.IsEmpty) + { + OperationStatus opStatus = UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Source, out uint nextScalarValue, out int bytesConsumedThisIteration); + + switch (opStatus) + { + case OperationStatus.Done: + + if (WillEncode((int)nextScalarValue)) + { + goto default; // source data must be transcoded + } + else + { + // Source data can be copied as-is. Attempt to memcpy it to the destination buffer. + + if (utf8Source.Slice(0, bytesConsumedThisIteration).TryCopyTo(utf8Destination)) + { + utf8Destination = utf8Destination.Slice(bytesConsumedThisIteration); + } + else + { + goto ReturnDestinationTooSmall; + } + } + + break; + + case OperationStatus.NeedMoreData: + + if (isFinalBlock) + { + goto default; // treat this as a normal invalid subsequence + } + else + { + goto ReturnNeedMoreData; + } + + default: + + // This code path is hit for ill-formed input data (where decoding has replaced it with U+FFFD) + // and for well-formed input data that must be escaped. + + if (TryEncodeUnicodeScalar((int)nextScalarValue, pTempCharBuffer, TempUtf16CharBufferLength, out int charsWrittenJustNow)) + { + // Now that we have it as UTF-16, transcode it to UTF-8. + // Need to copy it to a temporary buffer first, otherwise GetBytes might throw an exception + // due to lack of output space. 
+ + int transcodedByteCountThisIteration = Encoding.UTF8.GetBytes(pTempCharBuffer, charsWrittenJustNow, pTempUtf8Buffer, TempUtf8ByteBufferLength); + ReadOnlySpan transcodedUtf8BytesThisIteration = new ReadOnlySpan(pTempUtf8Buffer, transcodedByteCountThisIteration); + + if (!transcodedUtf8BytesThisIteration.TryCopyTo(utf8Destination)) + { + goto ReturnDestinationTooSmall; + } + + utf8Destination = utf8Destination.Slice(transcodedByteCountThisIteration); // advance destination buffer + } + else + { + // We really don't expect this to fail. If that happens we'll report an error to our caller. + + goto ReturnInvalidData; + } + + break; + } + + utf8Source = utf8Source.Slice(bytesConsumedThisIteration); + } + + // Input buffer has been fully processed! + + bytesConsumed = originalUtf8SourceLength; + bytesWritten = originalUtf8DestinationLength - utf8Destination.Length; + return OperationStatus.Done; + + ReturnDestinationTooSmall: + + bytesConsumed = originalUtf8SourceLength - utf8Source.Length; + bytesWritten = originalUtf8DestinationLength - utf8Destination.Length; + return OperationStatus.DestinationTooSmall; + + ReturnNeedMoreData: + + bytesConsumed = originalUtf8SourceLength - utf8Source.Length; + bytesWritten = originalUtf8DestinationLength - utf8Destination.Length; + return OperationStatus.NeedMoreData; + + ReturnInvalidData: + + bytesConsumed = originalUtf8SourceLength - utf8Source.Length; + bytesWritten = originalUtf8DestinationLength - utf8Destination.Length; + return OperationStatus.InvalidData; + } + + /// + /// Shim function which can call virtual method using fast dispatch. 
+ /// + internal static OperationStatus EncodeUtf8Shim(TextEncoder encoder, ReadOnlySpan utf8Source, Span utf8Destination, out int bytesConsumed, out int bytesWritten, bool isFinalBlock) + { + return encoder.EncodeUtf8(utf8Source, utf8Destination, out bytesConsumed, out bytesWritten, isFinalBlock); + } + private unsafe void EncodeCore(TextWriter output, char* value, int valueLength) { Debug.Assert(value != null & output != null); @@ -383,6 +569,55 @@ namespace System.Text.Encodings.Web } } + private unsafe int FindFirstCharacterToEncode(ReadOnlySpan text) + { + fixed (char* pText = &MemoryMarshal.GetReference(text)) + { + return FindFirstCharacterToEncode(pText, text.Length); + } + } + + /// + /// Given a UTF-8 text input buffer, finds the first element in the input buffer which would be + /// escaped by the current encoder instance. + /// + /// The UTF-8 text input buffer to search. + /// + /// The index of the first element in which would be escaped by the + /// current encoder instance, or -1 if no data in requires escaping. + /// + [EditorBrowsable(EditorBrowsableState.Never)] + internal virtual int FindFirstCharacterToEncodeUtf8(ReadOnlySpan utf8Text) + { + int originalUtf8TextLength = utf8Text.Length; + + // Loop through the input text, terminating when we see ill-formed UTF-8 or when we decode a scalar value + // that must be encoded. If we see either of these things then we'll return its index in the original + // input sequence. If we consume the entire text without seeing either of these, return -1 to indicate + // that the text can be copied as-is without escaping. 
+ + while (!utf8Text.IsEmpty) + { + if (UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Text, out uint nextScalarValue, out int bytesConsumedThisIteration) != OperationStatus.Done + || WillEncode((int)nextScalarValue)) + { + return originalUtf8TextLength - utf8Text.Length; + } + + utf8Text = utf8Text.Slice(bytesConsumedThisIteration); + } + + return -1; // no input data needs to be escaped + } + + /// + /// Shim function which can call virtual method using fast dispatch. + /// + internal static int FindFirstCharacterToEncodeUtf8Shim(TextEncoder encoder, ReadOnlySpan utf8Text) + { + return encoder.FindFirstCharacterToEncodeUtf8(utf8Text); + } + internal static unsafe bool TryCopyCharacters(char[] source, char* destination, int destinationLength, out int numberOfCharactersWritten) { Debug.Assert(source != null && destination != null && destinationLength >= 0); diff --git a/src/libraries/System.Text.Encodings.Web/src/System/Text/Unicode/UnicodeHelpers.cs b/src/libraries/System.Text.Encodings.Web/src/System/Text/Unicode/UnicodeHelpers.cs index aa66f92..55162cc 100644 --- a/src/libraries/System.Text.Encodings.Web/src/System/Text/Unicode/UnicodeHelpers.cs +++ b/src/libraries/System.Text.Encodings.Web/src/System/Text/Unicode/UnicodeHelpers.cs @@ -2,6 +2,7 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. +using System.Buffers; using System.Buffers.Binary; using System.Diagnostics; using System.Runtime.CompilerServices; @@ -47,6 +48,240 @@ namespace System.Text.Unicode } /// + /// A copy of the logic in Rune.DecodeFromUtf16. + /// + public static OperationStatus DecodeScalarValueFromUtf16(ReadOnlySpan source, out uint result, out int charsConsumed) + { + const char ReplacementChar = '\uFFFD'; + + if (!source.IsEmpty) + { + // First, check for the common case of a BMP scalar value. + // If this is correct, return immediately. 
+ + uint firstChar = source[0]; + if (!UnicodeUtility.IsSurrogateCodePoint(firstChar)) + { + result = firstChar; + charsConsumed = 1; + return OperationStatus.Done; + } + + // First thing we saw was a UTF-16 surrogate code point. + // Let's optimistically assume for now it's a high surrogate and hope + // that combining it with the next char yields useful results. + + if (1 < (uint)source.Length) + { + uint secondChar = source[1]; + if (UnicodeUtility.IsHighSurrogateCodePoint(firstChar) && UnicodeUtility.IsLowSurrogateCodePoint(secondChar)) + { + // Success! Formed a supplementary scalar value. + result = UnicodeUtility.GetScalarFromUtf16SurrogatePair(firstChar, secondChar); + charsConsumed = 2; + return OperationStatus.Done; + } + else + { + // Either the first character was a low surrogate, or the second + // character was not a low surrogate. This is an error. + goto InvalidData; + } + } + else if (!UnicodeUtility.IsHighSurrogateCodePoint(firstChar)) + { + // Quick check to make sure we're not going to report NeedMoreData for + // a single-element buffer where the data is a standalone low surrogate + // character. Since no additional data will ever make this valid, we'll + // report an error immediately. + goto InvalidData; + } + } + + // If we got to this point, the input buffer was empty, or the buffer + // was a single element in length and that element was a high surrogate char. + + charsConsumed = source.Length; + result = ReplacementChar; + return OperationStatus.NeedMoreData; + + InvalidData: + + charsConsumed = 1; // maximal invalid subsequence for UTF-16 is always a single code unit in length + result = ReplacementChar; + return OperationStatus.InvalidData; + } + + /// + /// A copy of the logic in Rune.DecodeFromUtf8. 
+ /// + public static OperationStatus DecodeScalarValueFromUtf8(ReadOnlySpan source, out uint result, out int bytesConsumed) + { + const char ReplacementChar = '\uFFFD'; + + // This method follows the Unicode Standard's recommendation for detecting + // the maximal subpart of an ill-formed subsequence. See The Unicode Standard, + // Ch. 3.9 for more details. In summary, when reporting an invalid subsequence, + // it tries to consume as many code units as possible as long as those code + // units constitute the beginning of a longer well-formed subsequence per Table 3-7. + + int index = 0; + + // Try reading input[0]. + + if ((uint)index >= (uint)source.Length) + { + goto NeedsMoreData; + } + + uint tempValue = source[index]; + if (!UnicodeUtility.IsAsciiCodePoint(tempValue)) + { + goto NotAscii; + } + + Finish: + + bytesConsumed = index + 1; + Debug.Assert(1 <= bytesConsumed && bytesConsumed <= 4); // Valid subsequences are always length [1..4] + result = tempValue; + return OperationStatus.Done; + + NotAscii: + + // Per Table 3-7, the beginning of a multibyte sequence must be a code unit in + // the range [C2..F4]. If it's outside of that range, it's either a standalone + // continuation byte, or it's an overlong two-byte sequence, or it's an out-of-range + // four-byte sequence. + + if (!UnicodeUtility.IsInRangeInclusive(tempValue, 0xC2, 0xF4)) + { + goto FirstByteInvalid; + } + + tempValue = (tempValue - 0xC2) << 6; + + // Try reading input[1]. + + index++; + if ((uint)index >= (uint)source.Length) + { + goto NeedsMoreData; + } + + // Continuation bytes are of the form [10xxxxxx], which means that their two's + // complement representation is in the range [-65..-128]. This allows us to + // perform a single comparison to see if a byte is a continuation byte. 
+ + int thisByteSignExtended = (sbyte)source[index]; + if (thisByteSignExtended >= -64) + { + goto Invalid; + } + + tempValue += (uint)thisByteSignExtended; + tempValue += 0x80; // remove the continuation byte marker + tempValue += (0xC2 - 0xC0) << 6; // remove the leading byte marker + + if (tempValue < 0x0800) + { + Debug.Assert(UnicodeUtility.IsInRangeInclusive(tempValue, 0x0080, 0x07FF)); + goto Finish; // this is a valid 2-byte sequence + } + + // This appears to be a 3- or 4-byte sequence. Since per Table 3-7 we now have + // enough information (from just two code units) to detect overlong or surrogate + // sequences, we need to perform these checks now. + + if (!UnicodeUtility.IsInRangeInclusive(tempValue, ((0xE0 - 0xC0) << 6) + (0xA0 - 0x80), ((0xF4 - 0xC0) << 6) + (0x8F - 0x80))) + { + // The first two bytes were not in the range [[E0 A0]..[F4 8F]]. + // This is an overlong 3-byte sequence or an out-of-range 4-byte sequence. + goto Invalid; + } + + if (UnicodeUtility.IsInRangeInclusive(tempValue, ((0xED - 0xC0) << 6) + (0xA0 - 0x80), ((0xED - 0xC0) << 6) + (0xBF - 0x80))) + { + // This is a UTF-16 surrogate code point, which is invalid in UTF-8. + goto Invalid; + } + + if (UnicodeUtility.IsInRangeInclusive(tempValue, ((0xF0 - 0xC0) << 6) + (0x80 - 0x80), ((0xF0 - 0xC0) << 6) + (0x8F - 0x80))) + { + // This is an overlong 4-byte sequence. + goto Invalid; + } + + // The first two bytes were just fine. We don't need to perform any other checks + // on the remaining bytes other than to see that they're valid continuation bytes. + + // Try reading input[2]. 
+ + index++; + if ((uint)index >= (uint)source.Length) + { + goto NeedsMoreData; + } + + thisByteSignExtended = (sbyte)source[index]; + if (thisByteSignExtended >= -64) + { + goto Invalid; // this byte is not a UTF-8 continuation byte + } + + tempValue <<= 6; + tempValue += (uint)thisByteSignExtended; + tempValue += 0x80; // remove the continuation byte marker + tempValue -= (0xE0 - 0xC0) << 12; // remove the leading byte marker + + if (tempValue <= 0xFFFF) + { + Debug.Assert(UnicodeUtility.IsInRangeInclusive(tempValue, 0x0800, 0xFFFF)); + goto Finish; // this is a valid 3-byte sequence + } + + // Try reading input[3]. + + index++; + if ((uint)index >= (uint)source.Length) + { + goto NeedsMoreData; + } + + thisByteSignExtended = (sbyte)source[index]; + if (thisByteSignExtended >= -64) + { + goto Invalid; // this byte is not a UTF-8 continuation byte + } + + tempValue <<= 6; + tempValue += (uint)thisByteSignExtended; + tempValue += 0x80; // remove the continuation byte marker + tempValue -= (0xF0 - 0xE0) << 18; // remove the leading byte marker + + UnicodeDebug.AssertIsValidSupplementaryPlaneScalar(tempValue); + goto Finish; // this is a valid 4-byte sequence + + FirstByteInvalid: + + index = 1; // Invalid subsequences are always at least length 1. + + Invalid: + + Debug.Assert(1 <= index && index <= 3); // Invalid subsequences are always length 1..3 + bytesConsumed = index; + result = ReplacementChar; + return OperationStatus.InvalidData; + + NeedsMoreData: + + Debug.Assert(0 <= index && index <= 3); // Incomplete subsequences are always length 0..3 + bytesConsumed = index; + result = ReplacementChar; + return OperationStatus.NeedMoreData; + } + + /// /// Returns a bitmap of all characters which are defined per the checked-in version /// of the Unicode specification. 
/// @@ -273,5 +508,24 @@ namespace System.Text.Unicode { return ((scalar & ~((int)char.MaxValue)) != 0); } + + /// + /// Returns iff is a UTF-8 continuation byte; + /// i.e., has binary representation 10xxxxxx, where x is any bit. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static bool IsUtf8ContinuationByte(in byte value) + { + // This API takes its input as a readonly ref so that the JIT can emit "cmp ModRM" statements + // directly rather than bounce a temporary through a register. That is, we want the JIT to be + // able to emit a single "cmp byte ptr [data], C0h" statement if we're querying a memory location + // to see if it's a continuation byte. Data that's already enregistered will go through the + // normal "cmp reg, C0h" code paths, perhaps with some extra unnecessary "movzx" instructions. + // + // The below check takes advantage of the two's complement representation of negative numbers. + // [ 0b1000_0000, 0b1011_1111 ] is [ -128 (sbyte.MinValue), -65 ] + + return ((sbyte)value < -64); + } } } diff --git a/src/libraries/System.Text.Encodings.Web/tests/CommonTestEncoder.cs b/src/libraries/System.Text.Encodings.Web/tests/CommonTestEncoder.cs deleted file mode 100644 index 7b1ca90..0000000 --- a/src/libraries/System.Text.Encodings.Web/tests/CommonTestEncoder.cs +++ /dev/null @@ -1,105 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System; -using System.Globalization; -using System.IO; -using System.Runtime.CompilerServices; -using System.Text.Encodings.Web; - -namespace Microsoft.Framework.WebEncoders -{ - /// - /// Dummy encoder used for unit testing. - /// - public sealed class CommonTestEncoder : IHtmlEncoder, IJavaScriptStringEncoder, IUrlEncoder - { - /// - /// Returns "HtmlEncode[[value]]". 
- /// - public string HtmlEncode(string value) - { - return EncodeCore(value); - } - - /// - /// Writes "HtmlEncode[[value]]". - /// - public void HtmlEncode(string value, int startIndex, int characterCount, TextWriter output) - { - EncodeCore(value, startIndex, characterCount, output); - } - - /// - /// Writes "HtmlEncode[[value]]". - /// - public void HtmlEncode(char[] value, int startIndex, int characterCount, TextWriter output) - { - EncodeCore(value, startIndex, characterCount, output); - } - - /// - /// Returns "JavaScriptStringEncode[[value]]". - /// - public string JavaScriptStringEncode(string value) - { - return EncodeCore(value); - } - - /// - /// Writes "JavaScriptStringEncode[[value]]". - /// - public void JavaScriptStringEncode(string value, int startIndex, int characterCount, TextWriter output) - { - EncodeCore(value, startIndex, characterCount, output); - } - - /// - /// Writes "JavaScriptStringEncode[[value]]". - /// - public void JavaScriptStringEncode(char[] value, int startIndex, int characterCount, TextWriter output) - { - EncodeCore(value, startIndex, characterCount, output); - } - - /// - /// Returns "UrlEncode[[value]]". - /// - public string UrlEncode(string value) - { - return EncodeCore(value); - } - - /// - /// Writes "UrlEncode[[value]]". - /// - public void UrlEncode(string value, int startIndex, int characterCount, TextWriter output) - { - EncodeCore(value, startIndex, characterCount, output); - } - - /// - /// Writes "UrlEncode[[value]]". 
- /// - public void UrlEncode(char[] value, int startIndex, int characterCount, TextWriter output) - { - EncodeCore(value, startIndex, characterCount, output); - } - - private static string EncodeCore(string value, [CallerMemberName] string encodeType = null) - { - return string.Format(CultureInfo.InvariantCulture, "{0}[[{1}]]", encodeType, value); - } - - private static void EncodeCore(string value, int startIndex, int characterCount, TextWriter output, [CallerMemberName] string encodeType = null) - { - output.Write(EncodeCore(value.Substring(startIndex, characterCount), encodeType)); - } - - private static void EncodeCore(char[] value, int startIndex, int characterCount, TextWriter output, [CallerMemberName] string encodeType = null) - { - output.Write(EncodeCore(new string(value, startIndex, characterCount), encodeType)); - } - } -} diff --git a/src/libraries/System.Text.Encodings.Web/tests/ConfigurableScalarTextEncoder.cs b/src/libraries/System.Text.Encodings.Web/tests/ConfigurableScalarTextEncoder.cs new file mode 100644 index 0000000..718ffcd --- /dev/null +++ b/src/libraries/System.Text.Encodings.Web/tests/ConfigurableScalarTextEncoder.cs @@ -0,0 +1,38 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Text.Encodings.Web; + +namespace Microsoft.Framework.WebEncoders +{ + /// + /// Dummy encoder used for unit testing. 
+ /// + public sealed class ConfigurableScalarTextEncoder : TextEncoder + { + private readonly Predicate _isScalarAllowed; + + public ConfigurableScalarTextEncoder(Predicate isScalarAllowed) + { + _isScalarAllowed = isScalarAllowed; + } + + public override int MaxOutputCharactersPerInputCharacter => throw new NotImplementedException(); + + public override unsafe int FindFirstCharacterToEncode(char* text, int textLength) => throw new NotImplementedException(); + + public override bool WillEncode(int unicodeScalar) => !_isScalarAllowed(unicodeScalar); + + /// + /// Encodes scalar as an unsigned hexadecimal number (min. 4 hex digits) surrounded by square brackets: "[XXXX]". + /// + public override unsafe bool TryEncodeUnicodeScalar(int unicodeScalar, char* buffer, int bufferLength, out int numberOfCharactersWritten) + { + string encoded = FormattableString.Invariant($"[{(uint)unicodeScalar:X4}]"); + numberOfCharactersWritten = (encoded.Length <= (uint)bufferLength) ? encoded.Length : 0; + return encoded.AsSpan().TryCopyTo(new Span(buffer, bufferLength)); + } + } +} diff --git a/src/libraries/System.Text.Encodings.Web/tests/System.Text.Encodings.Web.Tests.csproj b/src/libraries/System.Text.Encodings.Web/tests/System.Text.Encodings.Web.Tests.csproj index 565079d..a2e9feb 100644 --- a/src/libraries/System.Text.Encodings.Web/tests/System.Text.Encodings.Web.Tests.csproj +++ b/src/libraries/System.Text.Encodings.Web/tests/System.Text.Encodings.Web.Tests.csproj @@ -14,14 +14,15 @@ - - - - + + + + + + - @@ -45,6 +46,12 @@ + + System\Text\UnicodeDebug.cs + + + System\Text\UnicodeUtility.cs + UnicodeData.12.1.txt diff --git a/src/libraries/System.Text.Encodings.Web/tests/TextEncoderTests.cs b/src/libraries/System.Text.Encodings.Web/tests/TextEncoderTests.cs index ec4725e..c3345e2 100644 --- a/src/libraries/System.Text.Encodings.Web/tests/TextEncoderTests.cs +++ b/src/libraries/System.Text.Encodings.Web/tests/TextEncoderTests.cs @@ -3,10 +3,11 @@ // See the LICENSE file in the 
project root for more information. using System; -using System.Globalization; -using System.IO; +using System.Buffers; +using System.Collections.Generic; +using System.Linq; +using System.Text; using System.Text.Encodings.Web; -using System.Text.Unicode; using Xunit; namespace Microsoft.Framework.WebEncoders @@ -41,5 +42,240 @@ namespace Microsoft.Framework.WebEncoders Assert.Equal(eX + ePair + eX, encoder.Encode(X + Pair + X)); // two iterations, block, even length Assert.Equal(ePair + eX + ePair, encoder.Encode(Pair + X + Pair)); // three iterations, no block, odd length } + + [Theory] + [InlineData(0, 0)] + [InlineData(1, 0)] + [InlineData(2, 2)] + [InlineData(3, 3)] + [InlineData(4, 3)] + [InlineData(5, 3)] + [InlineData(6, 6)] + [InlineData(7, 6)] + [InlineData(8, 6)] + [InlineData(9, 6)] + [InlineData(10, 10)] + [InlineData(11, 11)] + [InlineData(12, 11)] + public void EncodeUtf8_WellFormedInput_DoesNotRequireEncoding_CopiedToDestinationCorrectly(int destinationSize, int expectedBytesCopied) + { + // This test considers input which is well-formed and doesn't need to be encoded. + // If the destination buffer is large enough, the data should be copied in its entirety. + // If the destination buffer is too small, only complete UTF-8 subsequences should be copied. + // We should never copy a partial subsequence, as it would cause a future call to EncodeUtf8 + // to misinterpret the data as ill-formed. + + // Arrange + + byte[] fullUtf8Input = new byte[] { + 0xC2, 0x82, + 0x40, + 0xE2, 0x90, 0x91, + 0xF3, 0xA0, 0xA1, 0xA2, + 0x50 }; // UTF-8 subsequences of varying length + + var encoder = new ConfigurableScalarTextEncoder(_ => true /* allow everything */); + + // Act & assert + + OperationStatus expectedOpStatus = (expectedBytesCopied == fullUtf8Input.Length) ? 
OperationStatus.Done : OperationStatus.DestinationTooSmall; + + byte[] destination = new byte[destinationSize]; + Assert.Equal(expectedOpStatus, encoder.EncodeUtf8(fullUtf8Input, destination, out int bytesConsumed, out int bytesWritten, isFinalBlock: true)); + Assert.Equal(expectedBytesCopied, bytesConsumed); + Assert.Equal(expectedBytesCopied, bytesWritten); // bytes written should match bytes consumed if no encoding needs to take place + Assert.Equal(fullUtf8Input.AsSpan(0, bytesConsumed).ToArray(), destination.AsSpan(0, bytesWritten).ToArray()); // ensure byte-for-byte copy + Assert.True(destination.AsSpan(bytesWritten).ToArray().All(el => el == 0)); // all remaining bytes should be unchanged + + destination = new byte[destinationSize]; + Assert.Equal(expectedOpStatus, encoder.EncodeUtf8(fullUtf8Input, destination, out bytesConsumed, out bytesWritten, isFinalBlock: false)); + Assert.Equal(expectedBytesCopied, bytesConsumed); + Assert.Equal(expectedBytesCopied, bytesWritten); // bytes written should match bytes consumed if no encoding needs to take place + Assert.Equal(fullUtf8Input.AsSpan(0, bytesConsumed).ToArray(), destination.AsSpan(0, bytesWritten).ToArray()); // ensure byte-for-byte copy + Assert.True(destination.AsSpan(bytesWritten).ToArray().All(el => el == 0)); // all remaining bytes should be unchanged + } + + [Fact] + public void EncodeUtf8_MixedInputWhichRequiresEncodingOrReplacement() + { + // Arrange + + var fullInput = new[] + { + new { utf8Bytes = new byte[] { 0x40 }, output = "@" }, + new { utf8Bytes = new byte[] { 0xC3, 0x85 }, output = "[00C5]" }, // U+00C5 LATIN CAPITAL LETTER A WITH RING ABOVE (encoded since odd scalar value) + new { utf8Bytes = new byte[] { 0xC3, 0x86 }, output = "\u00C6" }, // U+00C6 LATIN CAPITAL LETTER AE (on allow list) + new { utf8Bytes = new byte[] { 0xFF }, output = "[FFFD]" }, // (invalid UTF-8, replaced with encoded form of U+FFFD) + new { utf8Bytes = new byte[] { 0xEF, 0xBF, 0xBD }, output = "[FFFD]" }, // U+FFFD 
REPLACEMENT CHARACTER (encoded since not on allow list) + new { utf8Bytes = new byte[] { 0xF0, 0x90, 0x82, 0x82 }, output = "\U00010082" }, // U+10082 LINEAR B IDEOGRAM B104 DEER (not encoded since on allow list) + new { utf8Bytes = new byte[] { 0xF0, 0x90, 0x82, 0x83 }, output = "[10083]" }, // U+10083 LINEAR B IDEOGRAM B105 EQUID (encoded since not on allow list) + }; + + var encoder = new ConfigurableScalarTextEncoder(scalarValue => (scalarValue % 2) == 0 /* allow only even-valued scalars to be represented unescaped */); + + // Act & assert + + List aggregateInputBytesSoFar = new List(); + List expectedOutputBytesSoFar = new List(); + + foreach (var entry in fullInput) + { + int aggregateInputByteCountAtStartOfLoop = aggregateInputBytesSoFar.Count; + + byte[] destination; + int bytesConsumed, bytesWritten; + + for (int i = 0; i < entry.utf8Bytes.Length - 1; i++) + { + aggregateInputBytesSoFar.Add(entry.utf8Bytes[i]); + + // If not final block, partial encoding should say "needs more data". + // We'll try with various destination lengths just to make sure it doesn't affect result. 
+ + foreach (int destinationLength in new[] { expectedOutputBytesSoFar.Count, expectedOutputBytesSoFar.Count + 1024 }) + { + destination = new byte[destinationLength]; + + Assert.Equal(OperationStatus.NeedMoreData, encoder.EncodeUtf8(aggregateInputBytesSoFar.ToArray(), destination, out bytesConsumed, out bytesWritten, isFinalBlock: false)); + Assert.Equal(aggregateInputByteCountAtStartOfLoop, bytesConsumed); + Assert.Equal(expectedOutputBytesSoFar.Count, bytesWritten); + Assert.Equal(expectedOutputBytesSoFar.ToArray(), new Span(destination, 0, expectedOutputBytesSoFar.Count).ToArray()); + } + + // Now try it with "isFinalBlock = true" to force the U+FFFD conversion + + destination = new byte[expectedOutputBytesSoFar.Count]; // first with not enough output space to write "[FFFD]" + + Assert.Equal(OperationStatus.DestinationTooSmall, encoder.EncodeUtf8(aggregateInputBytesSoFar.ToArray(), destination, out bytesConsumed, out bytesWritten, isFinalBlock: true)); + Assert.Equal(aggregateInputByteCountAtStartOfLoop, bytesConsumed); + Assert.Equal(expectedOutputBytesSoFar.Count, bytesWritten); + Assert.Equal(expectedOutputBytesSoFar.ToArray(), new Span(destination, 0, expectedOutputBytesSoFar.Count).ToArray()); + + destination = new byte[expectedOutputBytesSoFar.Count + 1024]; // then with enough output space to write "[FFFD]" + + Assert.Equal(OperationStatus.Done, encoder.EncodeUtf8(aggregateInputBytesSoFar.ToArray(), destination, out bytesConsumed, out bytesWritten, isFinalBlock: true)); + Assert.Equal(aggregateInputBytesSoFar.Count, bytesConsumed); + Assert.Equal(expectedOutputBytesSoFar.Count + "[FFFD]".Length, bytesWritten); + Assert.Equal(expectedOutputBytesSoFar.Concat(Encoding.UTF8.GetBytes("[FFFD]")).ToArray(), new Span(destination, 0, expectedOutputBytesSoFar.Count + "[FFFD]".Length).ToArray()); + } + + // Consume the remainder of this entry and make sure it escaped properly (if needed). 
+ + aggregateInputBytesSoFar.Add(entry.utf8Bytes.Last()); + + // First with not enough space in the destination buffer. + + destination = new byte[expectedOutputBytesSoFar.Count + Encoding.UTF8.GetByteCount(entry.output) - 1]; + + Assert.Equal(OperationStatus.DestinationTooSmall, encoder.EncodeUtf8(aggregateInputBytesSoFar.ToArray(), destination, out bytesConsumed, out bytesWritten, isFinalBlock: true)); + Assert.Equal(aggregateInputByteCountAtStartOfLoop, bytesConsumed); + Assert.Equal(expectedOutputBytesSoFar.Count, bytesWritten); + Assert.Equal(expectedOutputBytesSoFar.ToArray(), new Span(destination, 0, expectedOutputBytesSoFar.Count).ToArray()); + + // Then with exactly enough space in the destination buffer, + // and again with more than enough space in the destination buffer. + + expectedOutputBytesSoFar.AddRange(Encoding.UTF8.GetBytes(entry.output)); + + foreach (int destinationLength in new[] { expectedOutputBytesSoFar.Count, expectedOutputBytesSoFar.Count + 1024 }) + { + destination = new byte[destinationLength]; + + Assert.Equal(OperationStatus.Done, encoder.EncodeUtf8(aggregateInputBytesSoFar.ToArray(), destination, out bytesConsumed, out bytesWritten, isFinalBlock: false)); + Assert.Equal(aggregateInputBytesSoFar.Count, bytesConsumed); + Assert.Equal(expectedOutputBytesSoFar.Count, bytesWritten); + Assert.Equal(expectedOutputBytesSoFar.ToArray(), new Span(destination, 0, expectedOutputBytesSoFar.Count).ToArray()); + } + } + } + + [Fact] + public void EncodeUtf8_EmptyInput_AlwaysSucceeds() + { + // Arrange + + var encoder = new ConfigurableScalarTextEncoder(_ => false /* disallow everything */); + + // Act & assert + + Assert.Equal(OperationStatus.Done, encoder.EncodeUtf8(ReadOnlySpan.Empty, Span.Empty, out int bytesConsumed, out int bytesWritten, isFinalBlock: true)); + Assert.Equal(0, bytesConsumed); + Assert.Equal(0, bytesWritten); + + Assert.Equal(OperationStatus.Done, encoder.EncodeUtf8(ReadOnlySpan.Empty, Span.Empty, out bytesConsumed, out 
bytesWritten, isFinalBlock: false)); + Assert.Equal(0, bytesConsumed); + Assert.Equal(0, bytesWritten); + } + + [Fact] + public void FindFirstCharToEncodeUtf8_EmptyInput_ReturnsNegOne() + { + // Arrange + + var encoder = new ConfigurableScalarTextEncoder(_ => false /* disallow everything */); + + // Act + + int idxOfFirstByteToEncode = encoder.FindFirstCharacterToEncodeUtf8(ReadOnlySpan.Empty); + + // Assert + + Assert.Equal(-1, idxOfFirstByteToEncode); + } + + [Fact] + public void FindFirstCharToEncodeUtf8_WellFormedData_AllCharsAllowed() + { + // Arrange + + byte[] inputBytes = Encoding.UTF8.GetBytes("\U00000040\U00000400\U00004000\U00040000"); // code units of different lengths + var encoder = new ConfigurableScalarTextEncoder(_ => true /* allow everything */); + + // Act + + int idxOfFirstByteToEncode = encoder.FindFirstCharacterToEncodeUtf8(inputBytes); + + // Assert + + Assert.Equal(-1, idxOfFirstByteToEncode); + } + + [Fact] + public void FindFirstCharToEncodeUtf8_WellFormedData_SomeCharsDisallowed() + { + // Arrange + + byte[] inputBytes = Encoding.UTF8.GetBytes("\U00000040\U00000400\U00004000\U00040000"); // code units of different lengths + var encoder = new ConfigurableScalarTextEncoder(codePoint => codePoint != 0x4000 /* disallow U+4000, allow all else */); + + // Act + + int idxOfFirstByteToEncode = encoder.FindFirstCharacterToEncodeUtf8(inputBytes); + + // Assert + + Assert.Equal(3, idxOfFirstByteToEncode); + } + + [Theory] + [InlineData(new byte[] { 0x00, 0xC0, 0x80, 0x80 }, 1)] + [InlineData(new byte[] { 0x00, 0xC2, 0x80, 0x80 }, 3)] + [InlineData(new byte[] { 0xF1, 0x80, 0x80 }, 0)] + [InlineData(new byte[] { 0xF1, 0x80, 0x80, 0x80, 0xFF }, 4)] + [InlineData(new byte[] { 0xFF, 0x80, 0x80, 0x80, 0xFF }, 0)] + public void FindFirstCharToEncodeUtf8_IllFormedData_ReturnsIndexOfIllFormedSubsequence(byte[] utf8Data, int expectedIndex) + { + // Arrange + + var encoder = new ConfigurableScalarTextEncoder(_ => true /* allow everything */); + + // Act + + 
int actualIndex = encoder.FindFirstCharacterToEncodeUtf8(utf8Data); + + // Assert + + Assert.Equal(expectedIndex, actualIndex); + } } } -- 2.7.4