From effeb6ee79ca64376775a81ef1b2d9b507a30163 Mon Sep 17 00:00:00 2001 From: Levi Broderick Date: Tue, 2 Apr 2019 21:07:31 -0700 Subject: [PATCH] Hook up new UTF-8 logic through UTF8Encoding - Add vectorized UTF-16 validation and transcoded byte counts - Move Utf16Utility into Unicode namespace alongside Utf8Utility - Fix some bugs in DecoderNLS's draining logic --- .../shared/System.Private.CoreLib.Shared.projitems | 3 +- .../shared/System/Globalization/CompareInfo.cs | 2 +- .../shared/System/Globalization/TextInfo.cs | 1 + .../shared/System/Marvin.OrdinalIgnoreCase.cs | 2 +- .../shared/System/Text/DecoderNLS.cs | 11 +- .../shared/System/Text/Encoding.Internal.cs | 21 +- .../shared/System/Text/Rune.cs | 1 + .../shared/System/Text/UTF8Encoding.cs | 2570 +++----------------- .../System/Text/Unicode/Utf16Utility.Validation.cs | 361 +++ .../System/Text/{ => Unicode}/Utf16Utility.cs | 2 +- .../shared/System/Text/Unicode/Utf8.cs | 6 +- .../System/Text/Unicode/Utf8Utility.Transcoding.cs | 2 +- .../System/Text/Unicode/Utf8Utility.Validation.cs | 19 +- .../shared/System/Text/Unicode/Utf8Utility.cs | 2 +- 14 files changed, 808 insertions(+), 2195 deletions(-) create mode 100644 src/System.Private.CoreLib/shared/System/Text/Unicode/Utf16Utility.Validation.cs rename src/System.Private.CoreLib/shared/System/Text/{ => Unicode}/Utf16Utility.cs (99%) diff --git a/src/System.Private.CoreLib/shared/System.Private.CoreLib.Shared.projitems b/src/System.Private.CoreLib/shared/System.Private.CoreLib.Shared.projitems index 3ef9b35..4d50d9f 100644 --- a/src/System.Private.CoreLib/shared/System.Private.CoreLib.Shared.projitems +++ b/src/System.Private.CoreLib/shared/System.Private.CoreLib.Shared.projitems @@ -797,11 +797,12 @@ - + + diff --git a/src/System.Private.CoreLib/shared/System/Globalization/CompareInfo.cs b/src/System.Private.CoreLib/shared/System/Globalization/CompareInfo.cs index 97bb90f..73e4b71 100644 --- a/src/System.Private.CoreLib/shared/System/Globalization/CompareInfo.cs 
+++ b/src/System.Private.CoreLib/shared/System/Globalization/CompareInfo.cs @@ -7,7 +7,7 @@ using System.Reflection; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Serialization; -using System.Text; +using System.Text.Unicode; using Internal.Runtime.CompilerServices; namespace System.Globalization diff --git a/src/System.Private.CoreLib/shared/System/Globalization/TextInfo.cs b/src/System.Private.CoreLib/shared/System/Globalization/TextInfo.cs index 12ce6d9..8752628 100644 --- a/src/System.Private.CoreLib/shared/System/Globalization/TextInfo.cs +++ b/src/System.Private.CoreLib/shared/System/Globalization/TextInfo.cs @@ -7,6 +7,7 @@ using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Serialization; using System.Text; +using System.Text.Unicode; using Internal.Runtime.CompilerServices; #if BIT64 diff --git a/src/System.Private.CoreLib/shared/System/Marvin.OrdinalIgnoreCase.cs b/src/System.Private.CoreLib/shared/System/Marvin.OrdinalIgnoreCase.cs index beab0cf..9e9bb31 100644 --- a/src/System.Private.CoreLib/shared/System/Marvin.OrdinalIgnoreCase.cs +++ b/src/System.Private.CoreLib/shared/System/Marvin.OrdinalIgnoreCase.cs @@ -5,7 +5,7 @@ using System.Buffers; using System.Diagnostics; using System.Runtime.InteropServices; -using System.Text; +using System.Text.Unicode; using Internal.Runtime.CompilerServices; #if BIT64 diff --git a/src/System.Private.CoreLib/shared/System/Text/DecoderNLS.cs b/src/System.Private.CoreLib/shared/System/Text/DecoderNLS.cs index 9040a94..bb5aa5f 100644 --- a/src/System.Private.CoreLib/shared/System/Text/DecoderNLS.cs +++ b/src/System.Private.CoreLib/shared/System/Text/DecoderNLS.cs @@ -266,6 +266,7 @@ namespace System.Text // to be in progress. Unlike EncoderNLS, this is simply a Debug.Assert. No exception is thrown. 
Debug.Assert(_fallbackBuffer is null || _fallbackBuffer.Remaining == 0, "Should have no data remaining in the fallback buffer."); + Debug.Assert(HasLeftoverData, "Caller shouldn't invoke this routine unless there's leftover data in the decoder."); // Copy the existing leftover data plus as many bytes as possible of the new incoming data // into a temporary concated buffer, then get its char count by decoding it. @@ -319,6 +320,7 @@ namespace System.Text // to be in progress. Unlike EncoderNLS, this is simply a Debug.Assert. No exception is thrown. Debug.Assert(_fallbackBuffer is null || _fallbackBuffer.Remaining == 0, "Should have no data remaining in the fallback buffer."); + Debug.Assert(HasLeftoverData, "Caller shouldn't invoke this routine unless there's leftover data in the decoder."); // Copy the existing leftover data plus as many bytes as possible of the new incoming data // into a temporary concated buffer, then transcode it from bytes to chars. @@ -370,6 +372,14 @@ namespace System.Text Finish: + // Report back the number of bytes (from the new incoming span) we consumed just now. + // This calculation is simple: it's the difference between the original leftover byte + // count and the number of bytes from the combined buffer we needed to decode the first + // scalar value. We need to report this before the call to SetLeftoverData / + // ClearLeftoverData because those methods will overwrite the _leftoverByteCount field. 
+ + bytesConsumed = combinedBufferBytesConsumed - _leftoverByteCount; + if (persistNewCombinedBuffer) { Debug.Assert(combinedBufferBytesConsumed == combinedBuffer.Length, "We should be asked to persist the entire combined buffer."); @@ -380,7 +390,6 @@ namespace System.Text ClearLeftoverData(); // the buffer contains no partial data; we'll go down the normal paths } - bytesConsumed = combinedBufferBytesConsumed - _leftoverByteCount; // amount of 'bytes' buffer consumed just now return charsWritten; DestinationTooSmall: diff --git a/src/System.Private.CoreLib/shared/System/Text/Encoding.Internal.cs b/src/System.Private.CoreLib/shared/System/Text/Encoding.Internal.cs index 0e32167..ca740a1 100644 --- a/src/System.Private.CoreLib/shared/System/Text/Encoding.Internal.cs +++ b/src/System.Private.CoreLib/shared/System/Text/Encoding.Internal.cs @@ -850,8 +850,14 @@ namespace System.Text ReadOnlySpan bytes = new ReadOnlySpan(pOriginalBytes, originalByteCount).Slice(bytesConsumedSoFar); - int totalCharCount = decoder.DrainLeftoverDataForGetCharCount(bytes, out int bytesConsumedJustNow); - bytes = bytes.Slice(bytesConsumedJustNow); + int bytesConsumedJustNow = 0; + int totalCharCount = 0; + + if (decoder.HasLeftoverData) + { + totalCharCount = decoder.DrainLeftoverDataForGetCharCount(bytes, out bytesConsumedJustNow); + bytes = bytes.Slice(bytesConsumedJustNow); + } // Now try invoking the "fast path" (no fallback) implementation. // We can use Unsafe.AsPointer here since these spans are created from pinned data (raw pointers). 
@@ -1120,10 +1126,15 @@ namespace System.Text ReadOnlySpan bytes = new ReadOnlySpan(pOriginalBytes, originalByteCount).Slice(bytesConsumedSoFar); Span chars = new Span(pOriginalChars, originalCharCount).Slice(charsWrittenSoFar); - int charsWrittenJustNow = decoder.DrainLeftoverDataForGetChars(bytes, chars, out int bytesConsumedJustNow); + int bytesConsumedJustNow = 0; + int charsWrittenJustNow = 0; - bytes = bytes.Slice(bytesConsumedJustNow); - chars = chars.Slice(charsWrittenJustNow); + if (decoder.HasLeftoverData) + { + charsWrittenJustNow = decoder.DrainLeftoverDataForGetChars(bytes, chars, out bytesConsumedJustNow); + bytes = bytes.Slice(bytesConsumedJustNow); + chars = chars.Slice(charsWrittenJustNow); + } Debug.Assert(!decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0, "Should be no remaining fallback data at this point."); diff --git a/src/System.Private.CoreLib/shared/System/Text/Rune.cs b/src/System.Private.CoreLib/shared/System/Text/Rune.cs index a91c0fc..a71750e 100644 --- a/src/System.Private.CoreLib/shared/System/Text/Rune.cs +++ b/src/System.Private.CoreLib/shared/System/Text/Rune.cs @@ -6,6 +6,7 @@ using System.Buffers; using System.Diagnostics; using System.Globalization; using System.Runtime.CompilerServices; +using System.Text.Unicode; namespace System.Text { diff --git a/src/System.Private.CoreLib/shared/System/Text/UTF8Encoding.cs b/src/System.Private.CoreLib/shared/System/Text/UTF8Encoding.cs index aaac975..7a3a1f7 100644 --- a/src/System.Private.CoreLib/shared/System/Text/UTF8Encoding.cs +++ b/src/System.Private.CoreLib/shared/System/Text/UTF8Encoding.cs @@ -15,9 +15,11 @@ #define FASTLOOP using System; +using System.Buffers; using System.Diagnostics; -using System.Globalization; +using System.Runtime.CompilerServices; using System.Runtime.InteropServices; +using System.Text.Unicode; namespace System.Text { @@ -129,22 +131,26 @@ namespace System.Text public override unsafe int GetByteCount(char[] chars, int index, int 
count) { // Validate input parameters - if (chars == null) - throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array); - if (index < 0 || count < 0) - throw new ArgumentOutOfRangeException((index < 0 ? nameof(index) : nameof(count)), SR.ArgumentOutOfRange_NeedNonNegNum); + if (chars is null) + { + ThrowHelper.ThrowArgumentNullException(ExceptionArgument.chars, ExceptionResource.ArgumentNull_Array); + } - if (chars.Length - index < count) - throw new ArgumentOutOfRangeException(nameof(chars), SR.ArgumentOutOfRange_IndexCountBuffer); + if ((index | count) < 0) + { + ThrowHelper.ThrowArgumentOutOfRangeException((index < 0) ? ExceptionArgument.index : ExceptionArgument.count, ExceptionResource.ArgumentOutOfRange_NeedNonNegNum); + } - // If no input, return 0, avoid fixed empty array problem - if (count == 0) - return 0; + if (chars.Length - index < count) + { + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.chars, ExceptionResource.ArgumentOutOfRange_IndexCountBuffer); + } - // Just call the pointer version fixed (char* pChars = chars) - return GetByteCount(pChars + index, count, null); + { + return GetByteCountCommon(pChars + index, count); + } } // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) @@ -154,12 +160,17 @@ namespace System.Text public override unsafe int GetByteCount(string chars) { - // Validate input - if (chars==null) - throw new ArgumentNullException("s"); + // Validate input parameters + + if (chars is null) + { + ThrowHelper.ThrowArgumentNullException(ExceptionArgument.chars); + } fixed (char* pChars = chars) - return GetByteCount(pChars, chars.Length, null); + { + return GetByteCountCommon(pChars, chars.Length); + } } // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) @@ -170,22 +181,78 @@ namespace System.Text public override unsafe int GetByteCount(char* chars, int count) { // Validate Parameters + if (chars == null) - throw new 
ArgumentNullException(nameof(chars), SR.ArgumentNull_Array); + { + ThrowHelper.ThrowArgumentNullException(ExceptionArgument.chars); + } if (count < 0) - throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum); + { + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.count, ExceptionResource.ArgumentOutOfRange_NeedNonNegNum); + } - // Call it with empty encoder - return GetByteCount(chars, count, null); + return GetByteCountCommon(chars, count); } public override unsafe int GetByteCount(ReadOnlySpan chars) { - fixed (char* charsPtr = &MemoryMarshal.GetNonNullPinnableReference(chars)) + // It's ok for us to pass null pointers down to the workhorse below. + + fixed (char* charsPtr = &MemoryMarshal.GetReference(chars)) + { + return GetByteCountCommon(charsPtr, chars.Length); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private unsafe int GetByteCountCommon(char* pChars, int charCount) + { + // Common helper method for all non-EncoderNLS entry points to GetByteCount. + // A modification of this method should be copied in to each of the supported encodings: ASCII, UTF8, UTF16, UTF32. + + Debug.Assert(charCount >= 0, "Caller should't specify negative length buffer."); + Debug.Assert(pChars != null || charCount == 0, "Input pointer shouldn't be null if non-zero length specified."); + + // First call into the fast path. + // Don't bother providing a fallback mechanism; our fast path doesn't use it. + + int totalByteCount = GetByteCountFast(pChars, charCount, fallback: null, out int charsConsumed); + + if (charsConsumed != charCount) + { + // If there's still data remaining in the source buffer, go down the fallback path. + // We need to check for integer overflow since the fallback could change the required + // output count in unexpected ways. 
+ + totalByteCount += GetByteCountWithFallback(pChars, charCount, charsConsumed); + if (totalByteCount < 0) + { + ThrowConversionOverflow(); + } + } + + return totalByteCount; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] // called directly by GetByteCountCommon + private protected sealed override unsafe int GetByteCountFast(char* pChars, int charsLength, EncoderFallback fallback, out int charsConsumed) + { + // The number of UTF-8 code units may exceed the number of UTF-16 code units, + // so we'll need to check for overflow before casting to Int32. + + char* ptrToFirstInvalidChar = Utf16Utility.GetPointerToFirstInvalidChar(pChars, charsLength, out long utf8CodeUnitCountAdjustment, out _); + + int tempCharsConsumed = (int)(ptrToFirstInvalidChar - pChars); + charsConsumed = tempCharsConsumed; + + long totalUtf8Bytes = tempCharsConsumed + utf8CodeUnitCountAdjustment; + if ((ulong)totalUtf8Bytes > int.MaxValue) { - return GetByteCount(charsPtr, chars.Length, baseEncoder: null); + ThrowConversionOverflow(); } + + return (int)totalUtf8Bytes; } // Parent method is safe. @@ -196,22 +263,37 @@ namespace System.Text public override unsafe int GetBytes(string s, int charIndex, int charCount, byte[] bytes, int byteIndex) { - if (s == null || bytes == null) - throw new ArgumentNullException((s == null ? nameof(s) : nameof(bytes)), SR.ArgumentNull_Array); + // Validate Parameters + + if (s is null || bytes is null) + { + ThrowHelper.ThrowArgumentNullException( + argument: (s is null) ? ExceptionArgument.s : ExceptionArgument.bytes, + resource: ExceptionResource.ArgumentNull_Array); + } - if (charIndex < 0 || charCount < 0) - throw new ArgumentOutOfRangeException((charIndex < 0 ? nameof(charIndex) : nameof(charCount)), SR.ArgumentOutOfRange_NeedNonNegNum); + if ((charIndex | charCount) < 0) + { + ThrowHelper.ThrowArgumentOutOfRangeException( + argument: (charIndex < 0) ? 
ExceptionArgument.charIndex : ExceptionArgument.charCount, + resource: ExceptionResource.ArgumentOutOfRange_NeedNonNegNum); + } if (s.Length - charIndex < charCount) - throw new ArgumentOutOfRangeException(nameof(s), SR.ArgumentOutOfRange_IndexCount); - - if (byteIndex < 0 || byteIndex > bytes.Length) - throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_Index); + { + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.s, ExceptionResource.ArgumentOutOfRange_IndexCount); + } - int byteCount = bytes.Length - byteIndex; + if ((uint)byteIndex > bytes.Length) + { + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.byteIndex, ExceptionResource.ArgumentOutOfRange_Index); + } - fixed (char* pChars = s) fixed (byte* pBytes = &MemoryMarshal.GetReference((Span)bytes)) - return GetBytes(pChars + charIndex, charCount, pBytes + byteIndex, byteCount, null); + fixed (char* pChars = s) + fixed (byte* pBytes = bytes) + { + return GetBytesCommon(pChars + charIndex, charCount, pBytes + byteIndex, bytes.Length - byteIndex); + } } // Encodes a range of characters in a character array into a range of bytes @@ -232,28 +314,36 @@ namespace System.Text byte[] bytes, int byteIndex) { // Validate parameters - if (chars == null || bytes == null) - throw new ArgumentNullException((chars == null ? nameof(chars) : nameof(bytes)), SR.ArgumentNull_Array); - - if (charIndex < 0 || charCount < 0) - throw new ArgumentOutOfRangeException((charIndex < 0 ? nameof(charIndex) : nameof(charCount)), SR.ArgumentOutOfRange_NeedNonNegNum); - if (chars.Length - charIndex < charCount) - throw new ArgumentOutOfRangeException(nameof(chars), SR.ArgumentOutOfRange_IndexCountBuffer); + if (chars is null || bytes is null) + { + ThrowHelper.ThrowArgumentNullException( + argument: (chars is null) ? 
ExceptionArgument.chars : ExceptionArgument.bytes, + resource: ExceptionResource.ArgumentNull_Array); + } - if (byteIndex < 0 || byteIndex > bytes.Length) - throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_Index); + if ((charIndex | charCount) < 0) + { + ThrowHelper.ThrowArgumentOutOfRangeException( + argument: (charIndex < 0) ? ExceptionArgument.charIndex : ExceptionArgument.charCount, + resource: ExceptionResource.ArgumentOutOfRange_NeedNonNegNum); + } - // If nothing to encode return 0, avoid fixed problem - if (charCount == 0) - return 0; + if (chars.Length - charIndex < charCount) + { + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.chars, ExceptionResource.ArgumentOutOfRange_IndexCount); + } - // Just call pointer version - int byteCount = bytes.Length - byteIndex; + if ((uint)byteIndex > bytes.Length) + { + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.byteIndex, ExceptionResource.ArgumentOutOfRange_Index); + } - fixed (char* pChars = chars) fixed (byte* pBytes = &MemoryMarshal.GetReference((Span)bytes)) - // Remember that byteCount is # to decode, not size of array. - return GetBytes(pChars + charIndex, charCount, pBytes + byteIndex, byteCount, null); + fixed (char* pChars = chars) + fixed (byte* pBytes = bytes) + { + return GetBytesCommon(pChars + charIndex, charCount, pBytes + byteIndex, bytes.Length - byteIndex); + } } // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) @@ -264,24 +354,77 @@ namespace System.Text public override unsafe int GetBytes(char* chars, int charCount, byte* bytes, int byteCount) { // Validate Parameters - if (bytes == null || chars == null) - throw new ArgumentNullException(bytes == null ? nameof(bytes) : nameof(chars), SR.ArgumentNull_Array); - if (charCount < 0 || byteCount < 0) - throw new ArgumentOutOfRangeException((charCount < 0 ? 
nameof(charCount) : nameof(byteCount)), SR.ArgumentOutOfRange_NeedNonNegNum); + if (chars == null || bytes == null) + { + ThrowHelper.ThrowArgumentNullException( + argument: (chars is null) ? ExceptionArgument.chars : ExceptionArgument.bytes, + resource: ExceptionResource.ArgumentNull_Array); + } + + if ((charCount | byteCount) < 0) + { + ThrowHelper.ThrowArgumentOutOfRangeException( + argument: (charCount < 0) ? ExceptionArgument.charCount : ExceptionArgument.byteCount, + resource: ExceptionResource.ArgumentOutOfRange_NeedNonNegNum); + } - return GetBytes(chars, charCount, bytes, byteCount, null); + return GetBytesCommon(chars, charCount, bytes, byteCount); } public override unsafe int GetBytes(ReadOnlySpan chars, Span bytes) { - fixed (char* charsPtr = &MemoryMarshal.GetNonNullPinnableReference(chars)) - fixed (byte* bytesPtr = &MemoryMarshal.GetNonNullPinnableReference(bytes)) + // It's ok for us to operate on null / empty spans. + + fixed (char* charsPtr = &MemoryMarshal.GetReference(chars)) + fixed (byte* bytesPtr = &MemoryMarshal.GetReference(bytes)) { - return GetBytes(charsPtr, chars.Length, bytesPtr, bytes.Length, baseEncoder: null); + return GetBytesCommon(charsPtr, chars.Length, bytesPtr, bytes.Length); } } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private unsafe int GetBytesCommon(char* pChars, int charCount, byte* pBytes, int byteCount) + { + // Common helper method for all non-EncoderNLS entry points to GetBytes. + // A modification of this method should be copied in to each of the supported encodings: ASCII, UTF8, UTF16, UTF32. 
+ + Debug.Assert(charCount >= 0, "Caller should't specify negative length buffer."); + Debug.Assert(pChars != null || charCount == 0, "Input pointer shouldn't be null if non-zero length specified."); + Debug.Assert(byteCount >= 0, "Caller should't specify negative length buffer."); + Debug.Assert(pBytes != null || byteCount == 0, "Input pointer shouldn't be null if non-zero length specified."); + + // First call into the fast path. + + int bytesWritten = GetBytesFast(pChars, charCount, pBytes, byteCount, out int charsConsumed); + + if (charsConsumed == charCount) + { + // All elements converted - return immediately. + + return bytesWritten; + } + else + { + // Simple narrowing conversion couldn't operate on entire buffer - invoke fallback. + + return GetBytesWithFallback(pChars, charCount, pBytes, byteCount, charsConsumed, bytesWritten); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] // called directly by GetBytesCommon + private protected sealed override unsafe int GetBytesFast(char* pChars, int charsLength, byte* pBytes, int bytesLength, out int charsConsumed) + { + // We don't care about the exact OperationStatus value returned by the workhorse routine; we only + // care if the workhorse was able to consume the entire input payload. If we're unable to do so, + // we'll handle the remainder in the fallback routine. + + Utf8Utility.TranscodeToUtf8(pChars, charsLength, pBytes, bytesLength, out char* pInputBufferRemaining, out byte* pOutputBufferRemaining); + + charsConsumed = (int)(pInputBufferRemaining - pChars); + return (int)(pOutputBufferRemaining - pBytes); + } + // Returns the number of characters produced by decoding a range of bytes // in a byte array. 
// @@ -293,22 +436,26 @@ namespace System.Text public override unsafe int GetCharCount(byte[] bytes, int index, int count) { // Validate Parameters - if (bytes == null) - throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array); - if (index < 0 || count < 0) - throw new ArgumentOutOfRangeException((index < 0 ? nameof(index) : nameof(count)), SR.ArgumentOutOfRange_NeedNonNegNum); + if (bytes is null) + { + ThrowHelper.ThrowArgumentNullException(ExceptionArgument.bytes, ExceptionResource.ArgumentNull_Array); + } - if (bytes.Length - index < count) - throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer); + if ((index | count) < 0) + { + ThrowHelper.ThrowArgumentOutOfRangeException((index < 0) ? ExceptionArgument.index : ExceptionArgument.count, ExceptionResource.ArgumentOutOfRange_NeedNonNegNum); + } - // If no input just return 0, fixed doesn't like 0 length arrays. - if (count == 0) - return 0; + if (bytes.Length - index < count) + { + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.bytes, ExceptionResource.ArgumentOutOfRange_IndexCountBuffer); + } - // Just call pointer version fixed (byte* pBytes = bytes) - return GetCharCount(pBytes + index, count, null); + { + return GetCharCountCommon(pBytes + index, count); + } } // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) @@ -319,20 +466,27 @@ namespace System.Text public override unsafe int GetCharCount(byte* bytes, int count) { // Validate Parameters + if (bytes == null) - throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array); + { + ThrowHelper.ThrowArgumentNullException(ExceptionArgument.bytes, ExceptionResource.ArgumentNull_Array); + } if (count < 0) - throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum); + { + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.count, ExceptionResource.ArgumentOutOfRange_NeedNonNegNum); + } - return 
GetCharCount(bytes, count, null); + return GetCharCountCommon(bytes, count); } public override unsafe int GetCharCount(ReadOnlySpan bytes) { - fixed (byte* bytesPtr = &MemoryMarshal.GetNonNullPinnableReference(bytes)) + // It's ok for us to pass null pointers down to the workhorse routine. + + fixed (byte* bytesPtr = &MemoryMarshal.GetReference(bytes)) { - return GetCharCount(bytesPtr, bytes.Length, baseDecoder: null); + return GetCharCountCommon(bytesPtr, bytes.Length); } } @@ -345,28 +499,36 @@ namespace System.Text char[] chars, int charIndex) { // Validate Parameters - if (bytes == null || chars == null) - throw new ArgumentNullException(bytes == null ? nameof(bytes) : nameof(chars), SR.ArgumentNull_Array); - - if (byteIndex < 0 || byteCount < 0) - throw new ArgumentOutOfRangeException((byteIndex < 0 ? nameof(byteIndex) : nameof(byteCount)), SR.ArgumentOutOfRange_NeedNonNegNum); - if ( bytes.Length - byteIndex < byteCount) - throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer); + if (bytes is null || chars is null) + { + ThrowHelper.ThrowArgumentNullException( + argument: (bytes is null) ? ExceptionArgument.bytes : ExceptionArgument.chars, + resource: ExceptionResource.ArgumentNull_Array); + } - if (charIndex < 0 || charIndex > chars.Length) - throw new ArgumentOutOfRangeException(nameof(charIndex), SR.ArgumentOutOfRange_Index); + if ((byteIndex | byteCount) < 0) + { + ThrowHelper.ThrowArgumentOutOfRangeException( + argument: (byteIndex < 0) ? 
ExceptionArgument.byteIndex : ExceptionArgument.byteCount, + resource: ExceptionResource.ArgumentOutOfRange_NeedNonNegNum); + } - // If no input, return 0 & avoid fixed problem - if (byteCount == 0) - return 0; + if (bytes.Length - byteIndex < byteCount) + { + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.bytes, ExceptionResource.ArgumentOutOfRange_IndexCountBuffer); + } - // Just call pointer version - int charCount = chars.Length - charIndex; + if ((uint)charIndex > (uint)chars.Length) + { + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.charIndex, ExceptionResource.ArgumentOutOfRange_Index); + } - fixed (byte* pBytes = bytes) fixed (char* pChars = &MemoryMarshal.GetReference((Span)chars)) - // Remember that charCount is # to decode, not size of array - return GetChars(pBytes + byteIndex, byteCount, pChars + charIndex, charCount, null); + fixed (byte* pBytes = bytes) + fixed (char* pChars = chars) + { + return GetCharsCommon(pBytes + byteIndex, byteCount, pChars + charIndex, chars.Length - charIndex); + } } // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) @@ -377,2120 +539,245 @@ namespace System.Text public unsafe override int GetChars(byte* bytes, int byteCount, char* chars, int charCount) { // Validate Parameters - if (bytes == null || chars == null) - throw new ArgumentNullException(bytes == null ? nameof(bytes) : nameof(chars), SR.ArgumentNull_Array); - - if (charCount < 0 || byteCount < 0) - throw new ArgumentOutOfRangeException((charCount < 0 ? nameof(charCount) : nameof(byteCount)), SR.ArgumentOutOfRange_NeedNonNegNum); - return GetChars(bytes, byteCount, chars, charCount, null); - } + if (bytes is null || chars is null) + { + ThrowHelper.ThrowArgumentNullException( + argument: (bytes is null) ? 
ExceptionArgument.bytes : ExceptionArgument.chars, + resource: ExceptionResource.ArgumentNull_Array); + } - public override unsafe int GetChars(ReadOnlySpan bytes, Span chars) - { - fixed (byte* bytesPtr = &MemoryMarshal.GetNonNullPinnableReference(bytes)) - fixed (char* charsPtr = &MemoryMarshal.GetNonNullPinnableReference(chars)) + if ((byteCount | charCount) < 0) { - return GetChars(bytesPtr, bytes.Length, charsPtr, chars.Length, baseDecoder: null); + ThrowHelper.ThrowArgumentOutOfRangeException( + argument: (byteCount < 0) ? ExceptionArgument.byteCount : ExceptionArgument.charCount, + resource: ExceptionResource.ArgumentOutOfRange_NeedNonNegNum); } - } - // Returns a string containing the decoded representation of a range of - // bytes in a byte array. - // - // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) - // So if you fix this, fix the others. Currently those include: - // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding - // parent method is safe + return GetCharsCommon(bytes, byteCount, chars, charCount); + } - public override unsafe string GetString(byte[] bytes, int index, int count) + public override unsafe int GetChars(ReadOnlySpan bytes, Span chars) { - // Validate Parameters - if (bytes == null) - throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array); - - if (index < 0 || count < 0) - throw new ArgumentOutOfRangeException((index < 0 ? nameof(index) : nameof(count)), SR.ArgumentOutOfRange_NeedNonNegNum); - - if (bytes.Length - index < count) - throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer); - - // Avoid problems with empty input buffer - if (count == 0) return string.Empty; + // It's ok for us to pass null pointers down to the workhorse below. 
- fixed (byte* pBytes = bytes) - return string.CreateStringFromEncoding( - pBytes + index, count, this); + fixed (byte* bytesPtr = &MemoryMarshal.GetReference(bytes)) + fixed (char* charsPtr = &MemoryMarshal.GetReference(chars)) + { + return GetCharsCommon(bytesPtr, bytes.Length, charsPtr, chars.Length); + } } + // WARNING: If we throw an error, then System.Resources.ResourceReader calls this method. + // So if we're really broken, then that could also throw an error... recursively. + // So try to make sure GetChars can at least process all uses by + // System.Resources.ResourceReader! // - // End of standard methods copied from EncodingNLS.cs - // - - // To simplify maintenance, the structure of GetByteCount and GetBytes should be - // kept the same as much as possible - internal sealed override unsafe int GetByteCount(char* chars, int count, EncoderNLS baseEncoder) + // Note: We throw exceptions on individually encoded surrogates and other non-shortest forms. + // If exceptions aren't turned on, then we drop all non-shortest & individual surrogates. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private unsafe int GetCharsCommon(byte* pBytes, int byteCount, char* pChars, int charCount) { - // For fallback we may need a fallback buffer. - // We wait to initialize it though in case we don't have any broken input unicode - EncoderFallbackBuffer fallbackBuffer = null; - char* pSrcForFallback; + // Common helper method for all non-DecoderNLS entry points to GetChars. + // A modification of this method should be copied in to each of the supported encodings: ASCII, UTF8, UTF16, UTF32. 
- char* pSrc = chars; - char* pEnd = pSrc + count; + Debug.Assert(byteCount >= 0, "Caller should't specify negative length buffer."); + Debug.Assert(pBytes != null || byteCount == 0, "Input pointer shouldn't be null if non-zero length specified."); + Debug.Assert(charCount >= 0, "Caller should't specify negative length buffer."); + Debug.Assert(pChars != null || charCount == 0, "Input pointer shouldn't be null if non-zero length specified."); - // Start by assuming we have as many as count - int byteCount = count; + // First call into the fast path. - int ch = 0; + int charsWritten = GetCharsFast(pBytes, byteCount, pChars, charCount, out int bytesConsumed); - if (baseEncoder != null) + if (bytesConsumed == byteCount) { - UTF8Encoder encoder = (UTF8Encoder)baseEncoder; - ch = encoder.surrogateChar; - - // We mustn't have left over fallback data when counting - if (encoder.InternalHasFallbackBuffer) - { - fallbackBuffer = encoder.FallbackBuffer; - if (fallbackBuffer.Remaining > 0) - throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, this.EncodingName, encoder.Fallback.GetType())); + // All elements converted - return immediately. - // Set our internal fallback interesting things. - fallbackBuffer.InternalInitialize(chars, pEnd, encoder, false); - } + return charsWritten; } - - for (;;) + else { - // SLOWLOOP: does all range checks, handles all special cases, but it is slow - if (pSrc >= pEnd) - { - if (ch == 0) - { - // Unroll any fallback that happens at the end - ch = fallbackBuffer != null ? fallbackBuffer.InternalGetNextChar() : 0; - if (ch > 0) - { - byteCount++; - goto ProcessChar; - } - } - else - { - // Case of surrogates in the fallback. 
- if (fallbackBuffer != null && fallbackBuffer.bFallingBack) - { - Debug.Assert(ch >= 0xD800 && ch <= 0xDBFF, - "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture)); - - ch = fallbackBuffer.InternalGetNextChar(); - byteCount++; - - if (InRange(ch, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END)) - { - ch = 0xfffd; - byteCount++; - goto EncodeChar; - } - else if (ch > 0) - { - goto ProcessChar; - } - else - { - byteCount--; // ignore last one. - break; - } - } - } - - if (ch <= 0) - { - break; - } - if (baseEncoder != null && !baseEncoder.MustFlush) - { - break; - } - - // attempt to encode the partial surrogate (will fallback or ignore it), it'll also subtract 1. - byteCount++; - goto EncodeChar; - } - - if (ch > 0) - { - Debug.Assert(ch >= 0xD800 && ch <= 0xDBFF, - "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture)); - - // use separate helper variables for local contexts so that the jit optimizations - // won't get confused about the variable lifetimes - int cha = *pSrc; - - // count the pending surrogate - byteCount++; - - // In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here. - // if (IsLowSurrogate(cha)) { - if (InRange(cha, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END)) - { - // Don't need a real # because we're just counting, anything > 0x7ff ('cept surrogate) will do. 
- ch = 0xfffd; - // ch = cha + (ch << 10) + - // (0x10000 - // - CharUnicodeInfo.LOW_SURROGATE_START - // - (CharUnicodeInfo.HIGH_SURROGATE_START << 10) ); - - // Use this next char - pSrc++; - } - // else ch is still high surrogate and encoding will fail (so don't add count) - - // attempt to encode the surrogate or partial surrogate - goto EncodeChar; - } - - // If we've used a fallback, then we have to check for it - if (fallbackBuffer != null) - { - ch = fallbackBuffer.InternalGetNextChar(); - if (ch > 0) - { - // We have an extra byte we weren't expecting. - byteCount++; - goto ProcessChar; - } - } - - // read next char. The JIT optimization seems to be getting confused when - // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead - ch = *pSrc; - pSrc++; - - ProcessChar: - // if (IsHighSurrogate(ch)) { - if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.HIGH_SURROGATE_END)) - { - // we will count this surrogate next time around - byteCount--; - continue; - } - // either good char or partial surrogate - - EncodeChar: - // throw exception on partial surrogate if necessary - // if (IsLowSurrogate(ch) || IsHighSurrogate(ch)) - if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END)) - { - // Lone surrogates aren't allowed - // Have to make a fallback buffer if we don't have one - if (fallbackBuffer == null) - { - // wait on fallbacks if we can - // For fallback we may need a fallback buffer - if (baseEncoder == null) - fallbackBuffer = this.encoderFallback.CreateFallbackBuffer(); - else - fallbackBuffer = baseEncoder.FallbackBuffer; - - // Set our internal fallback interesting things. - fallbackBuffer.InternalInitialize(chars, chars + count, baseEncoder, false); - } - - // Do our fallback. Actually we already know its a mixed up surrogate, - // so the ref pSrc isn't gonna do anything. 
- pSrcForFallback = pSrc; // Avoid passing pSrc by reference to allow it to be en-registered - fallbackBuffer.InternalFallback(unchecked((char)ch), ref pSrcForFallback); - pSrc = pSrcForFallback; - - // Ignore it if we don't throw (we had preallocated this ch) - byteCount--; - ch = 0; - continue; - } - - // Count them - if (ch > 0x7F) - { - if (ch > 0x7FF) - { - // the extra surrogate byte was compensated by the second surrogate character - // (2 surrogates make 4 bytes. We've already counted 2 bytes, 1 per char) - byteCount++; - } - byteCount++; - } - -#if BIT64 - // check for overflow - if (byteCount < 0) - { - break; - } -#endif - -#if FASTLOOP - // If still have fallback don't do fast loop - if (fallbackBuffer != null && (ch = fallbackBuffer.InternalGetNextChar()) != 0) - { - // We're reserving 1 byte for each char by default - byteCount++; - goto ProcessChar; - } - - int availableChars = PtrDiff(pEnd, pSrc); + // Simple narrowing conversion couldn't operate on entire buffer - invoke fallback. 
- // don't fall into the fast decoding loop if we don't have enough characters - if (availableChars <= 13) - { - // try to get over the remainder of the ascii characters fast though - char* pLocalEnd = pEnd; // hint to get pLocalEnd en-registered - while (pSrc < pLocalEnd) - { - ch = *pSrc; - pSrc++; - if (ch > 0x7F) - goto ProcessChar; - } - - // we are done - break; - } - -#if BIT64 - // make sure that we won't get a silent overflow inside the fast loop - // (Fall out to slow loop if we have this many characters) - availableChars &= 0x0FFFFFFF; -#endif - - // To compute the upper bound, assume that all characters are ASCII characters at this point, - // the boundary will be decreased for every non-ASCII character we encounter - // Also, we need 3 + 4 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates - char* pStop = pSrc + availableChars - (3 + 4); - - while (pSrc < pStop) - { - ch = *pSrc; - pSrc++; - - if (ch > 0x7F) // Not ASCII - { - if (ch > 0x7FF) // Not 2 Byte - { - if ((ch & 0xF800) == 0xD800) // See if its a Surrogate - goto LongCode; - byteCount++; - } - byteCount++; - } - - // get pSrc aligned - if ((unchecked((int)pSrc) & 0x2) != 0) - { - ch = *pSrc; - pSrc++; - if (ch > 0x7F) // Not ASCII - { - if (ch > 0x7FF) // Not 2 Byte - { - if ((ch & 0xF800) == 0xD800) // See if its a Surrogate - goto LongCode; - byteCount++; - } - byteCount++; - } - } - - // Run 2 * 4 characters at a time! 
- while (pSrc < pStop) - { - ch = *(int*)pSrc; - int chc = *(int*)(pSrc + 2); - if (((ch | chc) & unchecked((int)0xFF80FF80)) != 0) // See if not ASCII - { - if (((ch | chc) & unchecked((int)0xF800F800)) != 0) // See if not 2 Byte - { - goto LongCodeWithMask; - } - - - if ((ch & unchecked((int)0xFF800000)) != 0) // Actually 0x07800780 is all we care about (4 bits) - byteCount++; - if ((ch & unchecked((int)0xFF80)) != 0) - byteCount++; - if ((chc & unchecked((int)0xFF800000)) != 0) - byteCount++; - if ((chc & unchecked((int)0xFF80)) != 0) - byteCount++; - } - pSrc += 4; - - ch = *(int*)pSrc; - chc = *(int*)(pSrc + 2); - if (((ch | chc) & unchecked((int)0xFF80FF80)) != 0) // See if not ASCII - { - if (((ch | chc) & unchecked((int)0xF800F800)) != 0) // See if not 2 Byte - { - goto LongCodeWithMask; - } - - if ((ch & unchecked((int)0xFF800000)) != 0) - byteCount++; - if ((ch & unchecked((int)0xFF80)) != 0) - byteCount++; - if ((chc & unchecked((int)0xFF800000)) != 0) - byteCount++; - if ((chc & unchecked((int)0xFF80)) != 0) - byteCount++; - } - pSrc += 4; - } - break; - - LongCodeWithMask: - if (BitConverter.IsLittleEndian) - { - ch = (char)ch; - } - else - { - // be careful about the sign extension - ch = (int)(((uint)ch) >> 16); - } - pSrc++; - - if (ch <= 0x7F) - { - continue; - } - - LongCode: - // use separate helper variables for slow and fast loop so that the jit optimizations - // won't get confused about the variable lifetimes - if (ch > 0x7FF) - { - // if (IsLowSurrogate(ch) || IsHighSurrogate(ch)) - if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END)) - { - // 4 byte encoding - high surrogate + low surrogate - - int chd = *pSrc; - if ( - // !IsHighSurrogate(ch) // low without high -> bad - ch > CharUnicodeInfo.HIGH_SURROGATE_END || - // !IsLowSurrogate(chd) // high not followed by low -> bad - !InRange(chd, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END)) - { - // Back up and drop out to slow loop 
to figure out error - pSrc--; - break; - } - pSrc++; - - // byteCount - this byte is compensated by the second surrogate character - } - byteCount++; - } - byteCount++; - - // byteCount - the last byte is already included - } -#endif // FASTLOOP - - // no pending char at this point - ch = 0; + return GetCharsWithFallback(pBytes, byteCount, pChars, charCount, bytesConsumed, charsWritten); } - -#if BIT64 - // check for overflow - if (byteCount < 0) - { - throw new ArgumentException( - SR.Argument_ConversionOverflow); - } -#endif - - Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0, - "[UTF8Encoding.GetByteCount]Expected Empty fallback buffer"); - - return byteCount; } - // diffs two char pointers using unsigned arithmetic. The unsigned arithmetic - // is good enough for us, and it tends to generate better code than the signed - // arithmetic generated by default - private static unsafe int PtrDiff(char* a, char* b) + [MethodImpl(MethodImplOptions.AggressiveInlining)] // called directly by GetCharsCommon + private protected sealed override unsafe int GetCharsFast(byte* pBytes, int bytesLength, char* pChars, int charsLength, out int bytesConsumed) { - return (int)(((uint)((byte*)a - (byte*)b)) >> 1); - } + // We don't care about the exact OperationStatus value returned by the workhorse routine; we only + // care if the workhorse was able to consume the entire input payload. If we're unable to do so, + // we'll handle the remainder in the fallback routine. 
- // byte* flavor just for parity - private static unsafe int PtrDiff(byte* a, byte* b) - { - return (int)(a - b); - } + Utf8Utility.TranscodeToUtf16(pBytes, bytesLength, pChars, charsLength, out byte* pInputBufferRemaining, out char* pOutputBufferRemaining); - private static bool InRange(int ch, int start, int end) - { - return (uint)(ch - start) <= (uint)(end - start); + bytesConsumed = (int)(pInputBufferRemaining - pBytes); + return (int)(pOutputBufferRemaining - pChars); } - // Our workhorse - // Note: We ignore mismatched surrogates, unless the exception flag is set in which case we throw - internal sealed override unsafe int GetBytes( - char* chars, int charCount, byte* bytes, int byteCount, EncoderNLS baseEncoder) + private protected sealed override unsafe int GetCharsWithFallback(ReadOnlySpan bytes, int originalBytesLength, Span chars, int originalCharsLength, DecoderNLS decoder) { - Debug.Assert(chars != null, "[UTF8Encoding.GetBytes]chars!=null"); - Debug.Assert(byteCount >= 0, "[UTF8Encoding.GetBytes]byteCount >=0"); - Debug.Assert(charCount >= 0, "[UTF8Encoding.GetBytes]charCount >=0"); - Debug.Assert(bytes != null, "[UTF8Encoding.GetBytes]bytes!=null"); - - UTF8Encoder encoder = null; - - // For fallback we may need a fallback buffer. - // We wait to initialize it though in case we don't have any broken input unicode - EncoderFallbackBuffer fallbackBuffer = null; - char* pSrcForFallback; - - char* pSrc = chars; - byte* pTarget = bytes; - - char* pEnd = pSrc + charCount; - byte* pAllocatedBufferEnd = pTarget + byteCount; - - int ch = 0; - - // assume that JIT will en-register pSrc, pTarget and ch + // We special-case DecoderReplacementFallback if it's telling us to write a single U+FFFD char, + // since we believe this to be relatively common and we can handle it more efficiently than + // the base implementation. - if (baseEncoder != null) + if (((decoder is null) ? 
this.DecoderFallback : decoder.Fallback) is DecoderReplacementFallback replacementFallback + && replacementFallback.MaxCharCount == 1 + && replacementFallback.DefaultString[0] == UnicodeUtility.ReplacementChar) { - encoder = (UTF8Encoder)baseEncoder; - ch = encoder.surrogateChar; - - // We mustn't have left over fallback data when counting - if (encoder.InternalHasFallbackBuffer) - { - // We always need the fallback buffer in get bytes so we can flush any remaining ones if necessary - fallbackBuffer = encoder.FallbackBuffer; - if (fallbackBuffer.Remaining > 0 && encoder._throwOnOverflow) - throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, this.EncodingName, encoder.Fallback.GetType())); - - // Set our internal fallback interesting things. - fallbackBuffer.InternalInitialize(chars, pEnd, encoder, true); - } - } - - for (;;) - { - // SLOWLOOP: does all range checks, handles all special cases, but it is slow - - if (pSrc >= pEnd) - { - if (ch == 0) - { - // Check if there's anything left to get out of the fallback buffer - ch = fallbackBuffer != null ? 
fallbackBuffer.InternalGetNextChar() : 0; - if (ch > 0) - { - goto ProcessChar; - } - } - else - { - // Case of leftover surrogates in the fallback buffer - if (fallbackBuffer != null && fallbackBuffer.bFallingBack) - { - Debug.Assert(ch >= 0xD800 && ch <= 0xDBFF, - "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture)); - - int cha = ch; - - ch = fallbackBuffer.InternalGetNextChar(); - - if (InRange(ch, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END)) - { - ch = ch + (cha << 10) + (0x10000 - CharUnicodeInfo.LOW_SURROGATE_START - (CharUnicodeInfo.HIGH_SURROGATE_START << 10)); - goto EncodeChar; - } - else if (ch > 0) - { - goto ProcessChar; - } - else - { - break; - } - } - } - - // attempt to encode the partial surrogate (will fail or ignore) - if (ch > 0 && (encoder == null || encoder.MustFlush)) - goto EncodeChar; - - // We're done - break; - } - - if (ch > 0) - { - // We have a high surrogate left over from a previous loop. - Debug.Assert(ch >= 0xD800 && ch <= 0xDBFF, - "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture)); - - // use separate helper variables for local contexts so that the jit optimizations - // won't get confused about the variable lifetimes - int cha = *pSrc; - - // In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here. 
- // if (IsLowSurrogate(cha)) { - if (InRange(cha, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END)) - { - ch = cha + (ch << 10) + - (0x10000 - - CharUnicodeInfo.LOW_SURROGATE_START - - (CharUnicodeInfo.HIGH_SURROGATE_START << 10)); - - pSrc++; - } - // else ch is still high surrogate and encoding will fail - - // attempt to encode the surrogate or partial surrogate - goto EncodeChar; - } - - // If we've used a fallback, then we have to check for it - if (fallbackBuffer != null) - { - ch = fallbackBuffer.InternalGetNextChar(); - if (ch > 0) goto ProcessChar; - } - - // read next char. The JIT optimization seems to be getting confused when - // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead - ch = *pSrc; - pSrc++; - - ProcessChar: - // if (IsHighSurrogate(ch)) { - if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.HIGH_SURROGATE_END)) - { - continue; - } - // either good char or partial surrogate - - EncodeChar: - // throw exception on partial surrogate if necessary - // if (IsLowSurrogate(ch) || IsHighSurrogate(ch)) - if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END)) - { - // Lone surrogates aren't allowed, we have to do fallback for them - // Have to make a fallback buffer if we don't have one - if (fallbackBuffer == null) - { - // wait on fallbacks if we can - // For fallback we may need a fallback buffer - if (baseEncoder == null) - fallbackBuffer = this.encoderFallback.CreateFallbackBuffer(); - else - fallbackBuffer = baseEncoder.FallbackBuffer; - - // Set our internal fallback interesting things. - fallbackBuffer.InternalInitialize(chars, pEnd, baseEncoder, true); - } - - // Do our fallback. Actually we already know its a mixed up surrogate, - // so the ref pSrc isn't gonna do anything. 
- pSrcForFallback = pSrc; // Avoid passing pSrc by reference to allow it to be en-registered - fallbackBuffer.InternalFallback(unchecked((char)ch), ref pSrcForFallback); - pSrc = pSrcForFallback; - - // Ignore it if we don't throw - ch = 0; - continue; - } - - // Count bytes needed - int bytesNeeded = 1; - if (ch > 0x7F) - { - if (ch > 0x7FF) - { - if (ch > 0xFFFF) - { - bytesNeeded++; // 4 bytes (surrogate pair) - } - bytesNeeded++; // 3 bytes (800-FFFF) - } - bytesNeeded++; // 2 bytes (80-7FF) - } - - if (pTarget > pAllocatedBufferEnd - bytesNeeded) - { - // Left over surrogate from last time will cause pSrc == chars, so we'll throw - if (fallbackBuffer != null && fallbackBuffer.bFallingBack) - { - fallbackBuffer.MovePrevious(); // Didn't use this fallback char - if (ch > 0xFFFF) - fallbackBuffer.MovePrevious(); // Was surrogate, didn't use 2nd part either - } - else - { - pSrc--; // Didn't use this char - if (ch > 0xFFFF) - pSrc--; // Was surrogate, didn't use 2nd part either - } - Debug.Assert(pSrc >= chars || pTarget == bytes, - "[UTF8Encoding.GetBytes]Expected pSrc to be within buffer or to throw with insufficient room."); - ThrowBytesOverflow(encoder, pTarget == bytes); // Throw if we must - ch = 0; // Nothing left over (we backed up to start of pair if supplementary) - break; - } - - if (ch <= 0x7F) - { - *pTarget = (byte)ch; - } - else - { - // use separate helper variables for local contexts so that the jit optimizations - // won't get confused about the variable lifetimes - int chb; - if (ch <= 0x7FF) - { - // 2 byte encoding - chb = (byte)(unchecked((sbyte)0xC0) | (ch >> 6)); - } - else - { - if (ch <= 0xFFFF) - { - chb = (byte)(unchecked((sbyte)0xE0) | (ch >> 12)); - } - else - { - *pTarget = (byte)(unchecked((sbyte)0xF0) | (ch >> 18)); - pTarget++; - - chb = unchecked((sbyte)0x80) | (ch >> 12) & 0x3F; - } - *pTarget = (byte)chb; - pTarget++; - - chb = unchecked((sbyte)0x80) | (ch >> 6) & 0x3F; - } - *pTarget = (byte)chb; - pTarget++; - - *pTarget = 
(byte)(unchecked((sbyte)0x80) | ch & 0x3F); - } - pTarget++; - - -#if FASTLOOP - // If still have fallback don't do fast loop - if (fallbackBuffer != null && (ch = fallbackBuffer.InternalGetNextChar()) != 0) - goto ProcessChar; - - int availableChars = PtrDiff(pEnd, pSrc); - int availableBytes = PtrDiff(pAllocatedBufferEnd, pTarget); - - // don't fall into the fast decoding loop if we don't have enough characters - // Note that if we don't have enough bytes, pStop will prevent us from entering the fast loop. - if (availableChars <= 13) - { - // we are hoping for 1 byte per char - if (availableBytes < availableChars) - { - // not enough output room. no pending bits at this point - ch = 0; - continue; - } - - // try to get over the remainder of the ascii characters fast though - char* pLocalEnd = pEnd; // hint to get pLocalEnd en-registered - while (pSrc < pLocalEnd) - { - ch = *pSrc; - pSrc++; - - // Not ASCII, need more than 1 byte per char - if (ch > 0x7F) - goto ProcessChar; - - *pTarget = (byte)ch; - pTarget++; - } - // we are done, let ch be 0 to clear encoder - ch = 0; - break; - } - - // we need at least 1 byte per character, but Convert might allow us to convert - // only part of the input, so try as much as we can. Reduce charCount if necessary - if (availableBytes < availableChars) - { - availableChars = availableBytes; - } - - // FASTLOOP: - // - optimistic range checks - // - fallbacks to the slow loop for all special cases, exception throwing, etc. + // Don't care about the exact OperationStatus, just how much of the payload we were able + // to process. - // To compute the upper bound, assume that all characters are ASCII characters at this point, - // the boundary will be decreased for every non-ASCII character we encounter - // Also, we need 5 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates - // If there aren't enough bytes for the output, then pStop will be <= pSrc and will bypass the loop. 
- char* pStop = pSrc + availableChars - 5; + Utf8.ToUtf16(bytes, chars, out int bytesRead, out int charsWritten, replaceInvalidSequences: true, isFinalBlock: decoder is null || decoder.MustFlush); - while (pSrc < pStop) - { - ch = *pSrc; - pSrc++; - - if (ch > 0x7F) - { - goto LongCode; - } - *pTarget = (byte)ch; - pTarget++; - - // get pSrc aligned - if ((unchecked((int)pSrc) & 0x2) != 0) - { - ch = *pSrc; - pSrc++; - if (ch > 0x7F) - { - goto LongCode; - } - *pTarget = (byte)ch; - pTarget++; - } - - // Run 4 characters at a time! - while (pSrc < pStop) - { - ch = *(int*)pSrc; - int chc = *(int*)(pSrc + 2); - if (((ch | chc) & unchecked((int)0xFF80FF80)) != 0) - { - goto LongCodeWithMask; - } - - // Unfortunately, this is endianess sensitive - if (BitConverter.IsLittleEndian) - { - *pTarget = (byte)ch; - *(pTarget + 1) = (byte)(ch >> 16); - pSrc += 4; - *(pTarget + 2) = (byte)chc; - *(pTarget + 3) = (byte)(chc >> 16); - pTarget += 4; - } - else - { - *pTarget = (byte)(ch>>16); - *(pTarget+1) = (byte)ch; - pSrc += 4; - *(pTarget+2) = (byte)(chc>>16); - *(pTarget+3) = (byte)chc; - pTarget += 4; - } - } - continue; - - LongCodeWithMask: - if (BitConverter.IsLittleEndian) - { - ch = (char)ch; - } - else - { - // be careful about the sign extension - ch = (int)(((uint)ch) >> 16); - } - pSrc++; - - if (ch > 0x7F) - { - goto LongCode; - } - *pTarget = (byte)ch; - pTarget++; - continue; - - LongCode: - // use separate helper variables for slow and fast loop so that the jit optimizations - // won't get confused about the variable lifetimes - int chd; - if (ch <= 0x7FF) - { - // 2 byte encoding - chd = unchecked((sbyte)0xC0) | (ch >> 6); - } - else - { - // if (!IsLowSurrogate(ch) && !IsHighSurrogate(ch)) - if (!InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END)) - { - // 3 byte encoding - chd = unchecked((sbyte)0xE0) | (ch >> 12); - } - else - { - // 4 byte encoding - high surrogate + low surrogate - // if (!IsHighSurrogate(ch)) - if (ch > 
CharUnicodeInfo.HIGH_SURROGATE_END) - { - // low without high -> bad, try again in slow loop - pSrc -= 1; - break; - } - - chd = *pSrc; - pSrc++; - - // if (!IsLowSurrogate(chd)) { - if (!InRange(chd, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END)) - { - // high not followed by low -> bad, try again in slow loop - pSrc -= 2; - break; - } - - ch = chd + (ch << 10) + - (0x10000 - - CharUnicodeInfo.LOW_SURROGATE_START - - (CharUnicodeInfo.HIGH_SURROGATE_START << 10)); - - *pTarget = (byte)(unchecked((sbyte)0xF0) | (ch >> 18)); - // pStop - this byte is compensated by the second surrogate character - // 2 input chars require 4 output bytes. 2 have been anticipated already - // and 2 more will be accounted for by the 2 pStop-- calls below. - pTarget++; - - chd = unchecked((sbyte)0x80) | (ch >> 12) & 0x3F; - } - *pTarget = (byte)chd; - pStop--; // 3 byte sequence for 1 char, so need pStop-- and the one below too. - pTarget++; - - chd = unchecked((sbyte)0x80) | (ch >> 6) & 0x3F; - } - *pTarget = (byte)chd; - pStop--; // 2 byte sequence for 1 char so need pStop--. - pTarget++; - - *pTarget = (byte)(unchecked((sbyte)0x80) | ch & 0x3F); - // pStop - this byte is already included - pTarget++; - } + // Slice off how much we consumed / wrote. - Debug.Assert(pTarget <= pAllocatedBufferEnd, "[UTF8Encoding.GetBytes]pTarget <= pAllocatedBufferEnd"); + bytes = bytes.Slice(bytesRead); + chars = chars.Slice(charsWritten); + } -#endif // FASTLOOP + // If we couldn't go through our fast fallback mechanism, or if we still have leftover + // data because we couldn't consume everything in the loop above, we need to go down the + // slow fallback path. - // no pending char at this point - ch = 0; + if (bytes.IsEmpty) + { + return originalCharsLength - chars.Length; // total number of chars written } - - // Do we have to set the encoder bytes? 
- if (encoder != null) + else { - Debug.Assert(!encoder.MustFlush || ch == 0, - "[UTF8Encoding.GetBytes] Expected no mustflush or 0 leftover ch " + ch.ToString("X2", CultureInfo.InvariantCulture)); - - encoder.surrogateChar = ch; - encoder._charsUsed = (int)(pSrc - chars); + return base.GetCharsWithFallback(bytes, originalBytesLength, chars, originalCharsLength, decoder); } - - Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0 || - baseEncoder == null || !baseEncoder._throwOnOverflow, - "[UTF8Encoding.GetBytes]Expected empty fallback buffer if not converting"); - - return (int)(pTarget - bytes); } - - // These are bitmasks used to maintain the state in the decoder. They occupy the higher bits - // while the actual character is being built in the lower bits. They are shifted together - // with the actual bits of the character. - - // bits 30 & 31 are used for pending bits fixup - private const int FinalByte = 1 << 29; - private const int SupplimentarySeq = 1 << 28; - private const int ThreeByteSeq = 1 << 27; - - // Note: We throw exceptions on individually encoded surrogates and other non-shortest forms. - // If exceptions aren't turned on, then we drop all non-shortest &individual surrogates. + // Returns a string containing the decoded representation of a range of + // bytes in a byte array. // - // To simplify maintenance, the structure of GetCharCount and GetChars should be - // kept the same as much as possible - internal sealed override unsafe int GetCharCount(byte* bytes, int count, DecoderNLS baseDecoder) - { - Debug.Assert(count >= 0, "[UTF8Encoding.GetCharCount]count >=0"); - Debug.Assert(bytes != null, "[UTF8Encoding.GetCharCount]bytes!=null"); - - // Initialize stuff - byte* pSrc = bytes; - byte* pEnd = pSrc + count; + // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) + // So if you fix this, fix the others. 
Currently those include: + // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding + // parent method is safe - // Start by assuming we have as many as count, charCount always includes the adjustment - // for the character being decoded - int charCount = count; - int ch = 0; - DecoderFallbackBuffer fallback = null; + public override unsafe string GetString(byte[] bytes, int index, int count) + { + // Validate Parameters - if (baseDecoder != null) + if (bytes is null) { - UTF8Decoder decoder = (UTF8Decoder)baseDecoder; - ch = decoder.bits; - charCount -= (ch >> 30); // Adjust char count for # of expected bytes and expected output chars. - - // Shouldn't have anything in fallback buffer for GetCharCount - // (don't have to check _throwOnOverflow for count) - Debug.Assert(!decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0, - "[UTF8Encoding.GetCharCount]Expected empty fallback buffer at start"); + ThrowHelper.ThrowArgumentNullException(ExceptionArgument.bytes, ExceptionResource.ArgumentNull_Array); } - for (;;) + if ((index | count) < 0) { - // SLOWLOOP: does all range checks, handles all special cases, but it is slow - - if (pSrc >= pEnd) - { - break; - } - - if (ch == 0) - { - // no pending bits - goto ReadChar; - } - - // read next byte. 
The JIT optimization seems to be getting confused when - // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead - int cha = *pSrc; - pSrc++; - - // we are expecting to see trailing bytes like 10vvvvvv - if ((cha & unchecked((sbyte)0xC0)) != 0x80) - { - // This can be a valid starting byte for another UTF8 byte sequence, so let's put - // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence - pSrc--; - charCount += (ch >> 30); - goto InvalidByteSequence; - } - - // fold in the new byte - ch = (ch << 6) | (cha & 0x3F); - - if ((ch & FinalByte) == 0) - { - Debug.Assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0, - "[UTF8Encoding.GetChars]Invariant volation"); - - if ((ch & SupplimentarySeq) != 0) - { - if ((ch & (FinalByte >> 6)) != 0) - { - // this is 3rd byte (of 4 byte supplementary) - nothing to do - continue; - } - - // 2nd byte, check for non-shortest form of supplementary char and the valid - // supplementary characters in range 0x010000 - 0x10FFFF at the same time - if (!InRange(ch & 0x1F0, 0x10, 0x100)) - { - goto InvalidByteSequence; - } - } - else - { - // Must be 2nd byte of a 3-byte sequence - // check for non-shortest form of 3 byte seq - if ((ch & (0x1F << 5)) == 0 || // non-shortest form - (ch & (0xF800 >> 6)) == (0xD800 >> 6)) // illegal individually encoded surrogate - { - goto InvalidByteSequence; - } - } - continue; - } - - // ready to punch - - // adjust for surrogates in non-shortest form - if ((ch & (SupplimentarySeq | 0x1F0000)) == SupplimentarySeq) - { - charCount--; - } - goto EncodeChar; - - InvalidByteSequence: - // this code fragment should be close to the goto referencing it - // Have to do fallback for invalid bytes - if (fallback == null) - { - if (baseDecoder == null) - fallback = this.decoderFallback.CreateFallbackBuffer(); - else - fallback = baseDecoder.FallbackBuffer; - fallback.InternalInitialize(bytes, null); - } - charCount += FallbackInvalidByteSequence(pSrc, ch, 
fallback); - - ch = 0; - continue; - - ReadChar: - ch = *pSrc; - pSrc++; - - ProcessChar: - if (ch > 0x7F) - { - // If its > 0x7F, its start of a new multi-byte sequence - - // Long sequence, so unreserve our char. - charCount--; - - // bit 6 has to be non-zero for start of multibyte chars. - if ((ch & 0x40) == 0) - { - // Unexpected trail byte - goto InvalidByteSequence; - } - - // start a new long code - if ((ch & 0x20) != 0) - { - if ((ch & 0x10) != 0) - { - // 4 byte encoding - supplimentary character (2 surrogates) - - ch &= 0x0F; - - // check that bit 4 is zero and the valid supplimentary character - // range 0x000000 - 0x10FFFF at the same time - if (ch > 0x04) - { - ch |= 0xf0; - goto InvalidByteSequence; - } - - // Add bit flags so that when we check new characters & rotate we'll be flagged correctly. - // Final byte flag, count fix if we don't make final byte & supplimentary sequence flag. - ch |= (FinalByte >> 3 * 6) | // Final byte is 3 more bytes from now - (1 << 30) | // If it dies on next byte we'll need an extra char - (3 << (30 - 2 * 6)) | // If it dies on last byte we'll need to subtract a char - (SupplimentarySeq) | (SupplimentarySeq >> 6) | - (SupplimentarySeq >> 2 * 6) | (SupplimentarySeq >> 3 * 6); - - // Our character count will be 2 characters for these 4 bytes, so subtract another char - charCount--; - } - else - { - // 3 byte encoding - // Add bit flags so that when we check new characters & rotate we'll be flagged correctly. - ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) | - (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6)); - - // We'll expect 1 character for these 3 bytes, so subtract another char. 
- charCount--; - } - } - else - { - // 2 byte encoding - - ch &= 0x1F; - - // check for non-shortest form - if (ch <= 1) - { - ch |= 0xc0; - goto InvalidByteSequence; - } - - // Add bit flags so we'll be flagged correctly - ch |= (FinalByte >> 6); - } - continue; - } - - EncodeChar: - -#if FASTLOOP - int availableBytes = PtrDiff(pEnd, pSrc); - - // don't fall into the fast decoding loop if we don't have enough bytes - if (availableBytes <= 13) - { - // try to get over the remainder of the ascii characters fast though - byte* pLocalEnd = pEnd; // hint to get pLocalEnd en-registered - while (pSrc < pLocalEnd) - { - ch = *pSrc; - pSrc++; - - if (ch > 0x7F) - goto ProcessChar; - } - // we are done - ch = 0; - break; - } - - // To compute the upper bound, assume that all characters are ASCII characters at this point, - // the boundary will be decreased for every non-ASCII character we encounter - // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences - byte* pStop = pSrc + availableBytes - 7; - - while (pSrc < pStop) - { - ch = *pSrc; - pSrc++; - - if (ch > 0x7F) - { - goto LongCode; - } - - // get pSrc 2-byte aligned - if ((unchecked((int)pSrc) & 0x1) != 0) - { - ch = *pSrc; - pSrc++; - if (ch > 0x7F) - { - goto LongCode; - } - } - - // get pSrc 4-byte aligned - if ((unchecked((int)pSrc) & 0x2) != 0) - { - ch = *(ushort*)pSrc; - if ((ch & 0x8080) != 0) - { - goto LongCodeWithMask16; - } - pSrc += 2; - } - - // Run 8 + 8 characters at a time! 
- while (pSrc < pStop) - { - ch = *(int*)pSrc; - int chb = *(int*)(pSrc + 4); - if (((ch | chb) & unchecked((int)0x80808080)) != 0) - { - goto LongCodeWithMask32; - } - pSrc += 8; - - // This is a really small loop - unroll it - if (pSrc >= pStop) - break; - - ch = *(int*)pSrc; - chb = *(int*)(pSrc + 4); - if (((ch | chb) & unchecked((int)0x80808080)) != 0) - { - goto LongCodeWithMask32; - } - pSrc += 8; - } - break; - - LongCodeWithMask32: - if (BitConverter.IsLittleEndian) - { - ch &= 0xFF; - } - else - { - // be careful about the sign extension - ch = (int)(((uint)ch) >> 16); - } - LongCodeWithMask16: - if (BitConverter.IsLittleEndian) - { - ch &= 0xFF; - } - else - { - ch = (int)(((uint)ch) >> 8); - } - - pSrc++; - if (ch <= 0x7F) - { - continue; - } - - LongCode: - int chc = *pSrc; - pSrc++; - - if ( - // bit 6 has to be zero - (ch & 0x40) == 0 || - // we are expecting to see trailing bytes like 10vvvvvv - (chc & unchecked((sbyte)0xC0)) != 0x80) - { - goto BadLongCode; - } - - chc &= 0x3F; - - // start a new long code - if ((ch & 0x20) != 0) - { - // fold the first two bytes together - chc |= (ch & 0x0F) << 6; - - if ((ch & 0x10) != 0) - { - // 4 byte encoding - surrogate - ch = *pSrc; - if ( - // check that bit 4 is zero, the non-shortest form of surrogate - // and the valid surrogate range 0x000000 - 0x10FFFF at the same time - !InRange(chc >> 4, 0x01, 0x10) || - // we are expecting to see trailing bytes like 10vvvvvv - (ch & unchecked((sbyte)0xC0)) != 0x80) - { - goto BadLongCode; - } - - chc = (chc << 6) | (ch & 0x3F); - - ch = *(pSrc + 1); - // we are expecting to see trailing bytes like 10vvvvvv - if ((ch & unchecked((sbyte)0xC0)) != 0x80) - { - goto BadLongCode; - } - pSrc += 2; - - // extra byte - charCount--; - } - else - { - // 3 byte encoding - ch = *pSrc; - if ( - // check for non-shortest form of 3 byte seq - (chc & (0x1F << 5)) == 0 || - // Can't have surrogates here. 
- (chc & (0xF800 >> 6)) == (0xD800 >> 6) || - // we are expecting to see trailing bytes like 10vvvvvv - (ch & unchecked((sbyte)0xC0)) != 0x80) - { - goto BadLongCode; - } - pSrc++; - - // extra byte - charCount--; - } - } - else - { - // 2 byte encoding - - // check for non-shortest form - if ((ch & 0x1E) == 0) - { - goto BadLongCode; - } - } - - // extra byte - charCount--; - } -#endif // FASTLOOP - - // no pending bits at this point - ch = 0; - continue; - - BadLongCode: - pSrc -= 2; - ch = 0; - continue; + ThrowHelper.ThrowArgumentOutOfRangeException( + argument: (index < 0) ? ExceptionArgument.index : ExceptionArgument.count, + resource: ExceptionResource.ArgumentOutOfRange_NeedNonNegNum); } - // May have a problem if we have to flush - if (ch != 0) + if (bytes.Length - index < count) { - // We were already adjusting for these, so need to un-adjust - charCount += (ch >> 30); - if (baseDecoder == null || baseDecoder.MustFlush) - { - // Have to do fallback for invalid bytes - if (fallback == null) - { - if (baseDecoder == null) - fallback = this.decoderFallback.CreateFallbackBuffer(); - else - fallback = baseDecoder.FallbackBuffer; - fallback.InternalInitialize(bytes, null); - } - charCount += FallbackInvalidByteSequence(pSrc, ch, fallback); - } + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.bytes, ExceptionResource.ArgumentOutOfRange_IndexCountBuffer); } - // Shouldn't have anything in fallback buffer for GetCharCount - // (don't have to check _throwOnOverflow for count) - Debug.Assert(fallback == null || fallback.Remaining == 0, - "[UTF8Encoding.GetCharCount]Expected empty fallback buffer at end"); + // Avoid problems with empty input buffer + if (count == 0) + return string.Empty; - return charCount; + fixed (byte* pBytes = bytes) + { + return string.CreateStringFromEncoding(pBytes + index, count, this); + } } - // WARNING: If we throw an error, then System.Resources.ResourceReader calls this method. 
- // So if we're really broken, then that could also throw an error... recursively. - // So try to make sure GetChars can at least process all uses by - // System.Resources.ResourceReader! // - // Note: We throw exceptions on individually encoded surrogates and other non-shortest forms. - // If exceptions aren't turned on, then we drop all non-shortest &individual surrogates. + // End of standard methods copied from EncodingNLS.cs // - // To simplify maintenance, the structure of GetCharCount and GetChars should be - // kept the same as much as possible - internal sealed override unsafe int GetChars( - byte* bytes, int byteCount, char* chars, int charCount, DecoderNLS baseDecoder) - { - Debug.Assert(chars != null, "[UTF8Encoding.GetChars]chars!=null"); - Debug.Assert(byteCount >= 0, "[UTF8Encoding.GetChars]count >=0"); - Debug.Assert(charCount >= 0, "[UTF8Encoding.GetChars]charCount >=0"); - Debug.Assert(bytes != null, "[UTF8Encoding.GetChars]bytes!=null"); - - byte* pSrc = bytes; - char* pTarget = chars; - - byte* pEnd = pSrc + byteCount; - char* pAllocatedBufferEnd = pTarget + charCount; - - int ch = 0; - - DecoderFallbackBuffer fallback = null; - byte* pSrcForFallback; - char* pTargetForFallback; - if (baseDecoder != null) - { - UTF8Decoder decoder = (UTF8Decoder)baseDecoder; - ch = decoder.bits; - - // Shouldn't have anything in fallback buffer for GetChars - // (don't have to check _throwOnOverflow for chars, we always use all or none so always should be empty) - Debug.Assert(!decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0, - "[UTF8Encoding.GetChars]Expected empty fallback buffer at start"); - } - for (;;) - { - // SLOWLOOP: does all range checks, handles all special cases, but it is slow - - if (pSrc >= pEnd) - { - break; - } - - if (ch == 0) - { - // no pending bits - goto ReadChar; - } - - // read next byte. 
The JIT optimization seems to be getting confused when - // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead - int cha = *pSrc; - pSrc++; - - // we are expecting to see trailing bytes like 10vvvvvv - if ((cha & unchecked((sbyte)0xC0)) != 0x80) - { - // This can be a valid starting byte for another UTF8 byte sequence, so let's put - // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence - pSrc--; - goto InvalidByteSequence; - } - - // fold in the new byte - ch = (ch << 6) | (cha & 0x3F); - - if ((ch & FinalByte) == 0) - { - // Not at last byte yet - Debug.Assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0, - "[UTF8Encoding.GetChars]Invariant volation"); - - if ((ch & SupplimentarySeq) != 0) - { - // Its a 4-byte supplimentary sequence - if ((ch & (FinalByte >> 6)) != 0) - { - // this is 3rd byte of 4 byte sequence - nothing to do - continue; - } - - // 2nd byte of 4 bytes - // check for non-shortest form of surrogate and the valid surrogate - // range 0x000000 - 0x10FFFF at the same time - if (!InRange(ch & 0x1F0, 0x10, 0x100)) - { - goto InvalidByteSequence; - } - } - else - { - // Must be 2nd byte of a 3-byte sequence - // check for non-shortest form of 3 byte seq - if ((ch & (0x1F << 5)) == 0 || // non-shortest form - (ch & (0xF800 >> 6)) == (0xD800 >> 6)) // illegal individually encoded surrogate - { - goto InvalidByteSequence; - } - } - continue; - } - - // ready to punch - - // surrogate in shortest form? - // Might be possible to get rid of this? Already did non-shortest check for 4-byte sequence when reading 2nd byte? 
- if ((ch & (SupplimentarySeq | 0x1F0000)) > SupplimentarySeq) - { - // let the range check for the second char throw the exception - if (pTarget < pAllocatedBufferEnd) - { - *pTarget = (char)(((ch >> 10) & 0x7FF) + - unchecked((short)((CharUnicodeInfo.HIGH_SURROGATE_START - (0x10000 >> 10))))); - pTarget++; - - ch = (ch & 0x3FF) + - unchecked((int)(CharUnicodeInfo.LOW_SURROGATE_START)); - } - } - - goto EncodeChar; - - InvalidByteSequence: - // this code fragment should be close to the gotos referencing it - // Have to do fallback for invalid bytes - if (fallback == null) - { - if (baseDecoder == null) - fallback = this.decoderFallback.CreateFallbackBuffer(); - else - fallback = baseDecoder.FallbackBuffer; - fallback.InternalInitialize(bytes, pAllocatedBufferEnd); - } - // That'll back us up the appropriate # of bytes if we didn't get anywhere - pSrcForFallback = pSrc; // Avoid passing pSrc by reference to allow it to be en-registered - pTargetForFallback = pTarget; // Avoid passing pTarget by reference to allow it to be en-registered - bool fallbackResult = FallbackInvalidByteSequence(ref pSrcForFallback, ch, fallback, ref pTargetForFallback); - pSrc = pSrcForFallback; - pTarget = pTargetForFallback; - - if (!fallbackResult) - { - // Ran out of buffer space - // Need to throw an exception? 
- Debug.Assert(pSrc >= bytes || pTarget == chars, - "[UTF8Encoding.GetChars]Expected to throw or remain in byte buffer after fallback"); - fallback.InternalReset(); - ThrowCharsOverflow(baseDecoder, pTarget == chars); - ch = 0; - break; - } - Debug.Assert(pSrc >= bytes, - "[UTF8Encoding.GetChars]Expected invalid byte sequence to have remained within the byte array"); - ch = 0; - continue; - - ReadChar: - ch = *pSrc; - pSrc++; - - ProcessChar: - if (ch > 0x7F) - { - // If its > 0x7F, its start of a new multi-byte sequence - - // bit 6 has to be non-zero - if ((ch & 0x40) == 0) - { - goto InvalidByteSequence; - } - - // start a new long code - if ((ch & 0x20) != 0) - { - if ((ch & 0x10) != 0) - { - // 4 byte encoding - supplimentary character (2 surrogates) - - ch &= 0x0F; - - // check that bit 4 is zero and the valid supplimentary character - // range 0x000000 - 0x10FFFF at the same time - if (ch > 0x04) - { - ch |= 0xf0; - goto InvalidByteSequence; - } - - ch |= (FinalByte >> 3 * 6) | (1 << 30) | (3 << (30 - 2 * 6)) | - (SupplimentarySeq) | (SupplimentarySeq >> 6) | - (SupplimentarySeq >> 2 * 6) | (SupplimentarySeq >> 3 * 6); - } - else - { - // 3 byte encoding - ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) | - (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6)); - } - } - else - { - // 2 byte encoding - - ch &= 0x1F; - - // check for non-shortest form - if (ch <= 1) - { - ch |= 0xc0; - goto InvalidByteSequence; - } - - ch |= (FinalByte >> 6); - } - continue; - } - - EncodeChar: - // write the pending character - if (pTarget >= pAllocatedBufferEnd) - { - // Fix chars so we make sure to throw if we didn't output anything - ch &= 0x1fffff; - if (ch > 0x7f) - { - if (ch > 0x7ff) - { - if (ch >= CharUnicodeInfo.LOW_SURROGATE_START && - ch <= CharUnicodeInfo.LOW_SURROGATE_END) - { - pSrc--; // It was 4 bytes - pTarget--; // 1 was stored already, but we can't remember 1/2, so back up - } - else if (ch > 0xffff) - { - pSrc--; // It was 4 bytes, nothing 
was stored - } - pSrc--; // It was at least 3 bytes - } - pSrc--; // It was at least 2 bytes - } - pSrc--; - - // Throw that we don't have enough room (pSrc could be < chars if we had started to process - // a 4 byte sequence already) - Debug.Assert(pSrc >= bytes || pTarget == chars, - "[UTF8Encoding.GetChars]Expected pSrc to be within input buffer or throw due to no output]"); - ThrowCharsOverflow(baseDecoder, pTarget == chars); - - // Don't store ch in decoder, we already backed up to its start - ch = 0; - - // Didn't throw, just use this buffer size. - break; - } - *pTarget = (char)ch; - pTarget++; - -#if FASTLOOP - int availableChars = PtrDiff(pAllocatedBufferEnd, pTarget); - int availableBytes = PtrDiff(pEnd, pSrc); - - // don't fall into the fast decoding loop if we don't have enough bytes - // Test for availableChars is done because pStop would be <= pTarget. - if (availableBytes <= 13) - { - // we may need as many as 1 character per byte - if (availableChars < availableBytes) - { - // not enough output room. no pending bits at this point - ch = 0; - continue; - } - - // try to get over the remainder of the ascii characters fast though - byte* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered - while (pSrc < pLocalEnd) - { - ch = *pSrc; - pSrc++; - - if (ch > 0x7F) - goto ProcessChar; - - *pTarget = (char)ch; - pTarget++; - } - // we are done - ch = 0; - break; - } - - // we may need as many as 1 character per byte, so reduce the byte count if necessary. - // If availableChars is too small, pStop will be before pTarget and we won't do fast loop. 
- if (availableChars < availableBytes) - { - availableBytes = availableChars; - } - - // To compute the upper bound, assume that all characters are ASCII characters at this point, - // the boundary will be decreased for every non-ASCII character we encounter - // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences - char* pStop = pTarget + availableBytes - 7; - - while (pTarget < pStop) - { - ch = *pSrc; - pSrc++; - - if (ch > 0x7F) - { - goto LongCode; - } - *pTarget = (char)ch; - pTarget++; - - // get pSrc to be 2-byte aligned - if ((unchecked((int)pSrc) & 0x1) != 0) - { - ch = *pSrc; - pSrc++; - if (ch > 0x7F) - { - goto LongCode; - } - *pTarget = (char)ch; - pTarget++; - } - - // get pSrc to be 4-byte aligned - if ((unchecked((int)pSrc) & 0x2) != 0) - { - ch = *(ushort*)pSrc; - if ((ch & 0x8080) != 0) - { - goto LongCodeWithMask16; - } - - // Unfortunately, this is endianess sensitive - if (BitConverter.IsLittleEndian) - { - *pTarget = (char)(ch & 0x7F); - pSrc += 2; - *(pTarget + 1) = (char)((ch >> 8) & 0x7F); - pTarget += 2; - } - else - { - *pTarget = (char)((ch >> 8) & 0x7F); - pSrc += 2; - *(pTarget+1) = (char)(ch & 0x7F); - pTarget += 2; - } - } - - // Run 8 characters at a time! 
- while (pTarget < pStop) - { - ch = *(int*)pSrc; - int chb = *(int*)(pSrc + 4); - if (((ch | chb) & unchecked((int)0x80808080)) != 0) - { - goto LongCodeWithMask32; - } - - // Unfortunately, this is endianess sensitive - if (BitConverter.IsLittleEndian) - { - *pTarget = (char)(ch & 0x7F); - *(pTarget + 1) = (char)((ch >> 8) & 0x7F); - *(pTarget + 2) = (char)((ch >> 16) & 0x7F); - *(pTarget + 3) = (char)((ch >> 24) & 0x7F); - pSrc += 8; - *(pTarget + 4) = (char)(chb & 0x7F); - *(pTarget + 5) = (char)((chb >> 8) & 0x7F); - *(pTarget + 6) = (char)((chb >> 16) & 0x7F); - *(pTarget + 7) = (char)((chb >> 24) & 0x7F); - pTarget += 8; - } - else - { - *pTarget = (char)((ch >> 24) & 0x7F); - *(pTarget+1) = (char)((ch >> 16) & 0x7F); - *(pTarget+2) = (char)((ch >> 8) & 0x7F); - *(pTarget+3) = (char)(ch & 0x7F); - pSrc += 8; - *(pTarget+4) = (char)((chb >> 24) & 0x7F); - *(pTarget+5) = (char)((chb >> 16) & 0x7F); - *(pTarget+6) = (char)((chb >> 8) & 0x7F); - *(pTarget+7) = (char)(chb & 0x7F); - pTarget += 8; - } - } - break; - - LongCodeWithMask32: - if (BitConverter.IsLittleEndian) - { - ch &= 0xFF; - } - else - { - // be careful about the sign extension - ch = (int)(((uint)ch) >> 16); - } - LongCodeWithMask16: - if (BitConverter.IsLittleEndian) - { - ch &= 0xFF; - } - else - { - ch = (int)(((uint)ch) >> 8); - } - pSrc++; - if (ch <= 0x7F) - { - *pTarget = (char)ch; - pTarget++; - continue; - } - - LongCode: - int chc = *pSrc; - pSrc++; - - if ( - // bit 6 has to be zero - (ch & 0x40) == 0 || - // we are expecting to see trailing bytes like 10vvvvvv - (chc & unchecked((sbyte)0xC0)) != 0x80) - { - goto BadLongCode; - } - - chc &= 0x3F; - - // start a new long code - if ((ch & 0x20) != 0) - { - // fold the first two bytes together - chc |= (ch & 0x0F) << 6; - - if ((ch & 0x10) != 0) - { - // 4 byte encoding - surrogate - ch = *pSrc; - if ( - // check that bit 4 is zero, the non-shortest form of surrogate - // and the valid surrogate range 0x000000 - 0x10FFFF at the same time 
- !InRange(chc >> 4, 0x01, 0x10) || - // we are expecting to see trailing bytes like 10vvvvvv - (ch & unchecked((sbyte)0xC0)) != 0x80) - { - goto BadLongCode; - } - - chc = (chc << 6) | (ch & 0x3F); - - ch = *(pSrc + 1); - // we are expecting to see trailing bytes like 10vvvvvv - if ((ch & unchecked((sbyte)0xC0)) != 0x80) - { - goto BadLongCode; - } - pSrc += 2; - - ch = (chc << 6) | (ch & 0x3F); - - *pTarget = (char)(((ch >> 10) & 0x7FF) + - unchecked((short)(CharUnicodeInfo.HIGH_SURROGATE_START - (0x10000 >> 10)))); - pTarget++; - - ch = (ch & 0x3FF) + - unchecked((short)(CharUnicodeInfo.LOW_SURROGATE_START)); - - // extra byte, we're already planning 2 chars for 2 of these bytes, - // but the big loop is testing the target against pStop, so we need - // to subtract 2 more or we risk overrunning the input. Subtract - // one here and one below. - pStop--; - } - else - { - // 3 byte encoding - ch = *pSrc; - if ( - // check for non-shortest form of 3 byte seq - (chc & (0x1F << 5)) == 0 || - // Can't have surrogates here. - (chc & (0xF800 >> 6)) == (0xD800 >> 6) || - // we are expecting to see trailing bytes like 10vvvvvv - (ch & unchecked((sbyte)0xC0)) != 0x80) - { - goto BadLongCode; - } - pSrc++; - - ch = (chc << 6) | (ch & 0x3F); - - // extra byte, we're only expecting 1 char for each of these 3 bytes, - // but the loop is testing the target (not source) against pStop, so - // we need to subtract 2 more or we risk overrunning the input. - // Subtract 1 here and one more below - pStop--; - } - } - else - { - // 2 byte encoding - - ch &= 0x1F; - - // check for non-shortest form - if (ch <= 1) - { - goto BadLongCode; - } - ch = (ch << 6) | chc; - } - - *pTarget = (char)ch; - pTarget++; - - // extra byte, we're only expecting 1 char for each of these 2 bytes, - // but the loop is testing the target (not source) against pStop. - // subtract an extra count from pStop so that we don't overrun the input. 
- pStop--; - } -#endif // FASTLOOP + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private unsafe int GetCharCountCommon(byte* pBytes, int byteCount) + { + // Common helper method for all non-DecoderNLS entry points to GetCharCount. + // A modification of this method should be copied in to each of the supported encodings: ASCII, UTF8, UTF16, UTF32. - Debug.Assert(pTarget <= pAllocatedBufferEnd, "[UTF8Encoding.GetChars]pTarget <= pAllocatedBufferEnd"); + Debug.Assert(byteCount >= 0, "Caller should't specify negative length buffer."); + Debug.Assert(pBytes != null || byteCount == 0, "Input pointer shouldn't be null if non-zero length specified."); - // no pending bits at this point - ch = 0; - continue; + // First call into the fast path. + // Don't bother providing a fallback mechanism; our fast path doesn't use it. - BadLongCode: - pSrc -= 2; - ch = 0; - continue; - } + int totalCharCount = GetCharCountFast(pBytes, byteCount, fallback: null, out int bytesConsumed); - if (ch != 0 && (baseDecoder == null || baseDecoder.MustFlush)) + if (bytesConsumed != byteCount) { - // Have to do fallback for invalid bytes - if (fallback == null) - { - if (baseDecoder == null) - fallback = this.decoderFallback.CreateFallbackBuffer(); - else - fallback = baseDecoder.FallbackBuffer; - fallback.InternalInitialize(bytes, pAllocatedBufferEnd); - } - - // That'll back us up the appropriate # of bytes if we didn't get anywhere - pSrcForFallback = pSrc; // Avoid passing pSrc by reference to allow it to be en-registered - pTargetForFallback = pTarget; // Avoid passing pTarget by reference to allow it to be en-registered - bool fallbackResult = FallbackInvalidByteSequence(ref pSrcForFallback, ch, fallback, ref pTargetForFallback); - pSrc = pSrcForFallback; - pTarget = pTargetForFallback; + // If there's still data remaining in the source buffer, go down the fallback path. 
+ // We need to check for integer overflow since the fallback could change the required + // output count in unexpected ways. - if (!fallbackResult) + totalCharCount += GetCharCountWithFallback(pBytes, byteCount, bytesConsumed); + if (totalCharCount < 0) { - Debug.Assert(pSrc >= bytes || pTarget == chars, - "[UTF8Encoding.GetChars]Expected to throw or remain in byte buffer while flushing"); - - // Ran out of buffer space - // Need to throw an exception? - fallback.InternalReset(); - ThrowCharsOverflow(baseDecoder, pTarget == chars); + ThrowConversionOverflow(); } - Debug.Assert(pSrc >= bytes, - "[UTF8Encoding.GetChars]Expected flushing invalid byte sequence to have remained within the byte array"); - ch = 0; } - if (baseDecoder != null) - { - UTF8Decoder decoder = (UTF8Decoder)baseDecoder; + return totalCharCount; + } - // If we're storing flush data we expect all bits to be used or else - // we're stuck in the middle of a conversion - Debug.Assert(!baseDecoder.MustFlush || ch == 0 || !baseDecoder._throwOnOverflow, - "[UTF8Encoding.GetChars]Expected no must flush or no left over bits or no throw on overflow."); + [MethodImpl(MethodImplOptions.AggressiveInlining)] // called directly by GetCharCountCommon + private protected sealed override unsafe int GetCharCountFast(byte* pBytes, int bytesLength, DecoderFallback fallback, out int bytesConsumed) + { + // The number of UTF-16 code units will never exceed the number of UTF-8 code units, + // so the addition at the end of this method will not overflow. - // Remember our leftover bits. 
- decoder.bits = ch; + byte* ptrToFirstInvalidByte = Utf8Utility.GetPointerToFirstInvalidByte(pBytes, bytesLength, out int utf16CodeUnitCountAdjustment, out _); - baseDecoder._bytesUsed = (int)(pSrc - bytes); - } + int tempBytesConsumed = (int)(ptrToFirstInvalidByte - pBytes); + bytesConsumed = tempBytesConsumed; - // Shouldn't have anything in fallback buffer for GetChars - // (don't have to check _throwOnOverflow for chars) - Debug.Assert(fallback == null || fallback.Remaining == 0, - "[UTF8Encoding.GetChars]Expected empty fallback buffer at end"); - - return PtrDiff(pTarget, chars); + return tempBytesConsumed + utf16CodeUnitCountAdjustment; } - // During GetChars we had an invalid byte sequence - // pSrc is backed up to the start of the bad sequence if we didn't have room to - // fall it back. Otherwise pSrc remains where it is. - private unsafe bool FallbackInvalidByteSequence( - ref byte* pSrc, int ch, DecoderFallbackBuffer fallback, ref char* pTarget) + public override Decoder GetDecoder() { - // Get our byte[] - byte* pStart = pSrc; - byte[] bytesUnknown = GetBytesUnknown(ref pStart, ch); - - // Do the actual fallback - if (!fallback.InternalFallback(bytesUnknown, pSrc, ref pTarget)) - { - // Oops, it failed, back up to pStart - pSrc = pStart; - return false; - } - - // It worked - return true; + return new DecoderNLS(this); } - // During GetCharCount we had an invalid byte sequence - // pSrc is used to find the index that points to the invalid bytes, - // however the byte[] contains the fallback bytes (in case the index is -1) - private unsafe int FallbackInvalidByteSequence( - byte* pSrc, int ch, DecoderFallbackBuffer fallback) + + public override Encoder GetEncoder() { - // Calling GetBytesUnknown can adjust the pSrc pointer but we need to pass the pointer before the adjustment - // to fallback.InternalFallback. 
The input pSrc to fallback.InternalFallback will only be used to calculate the - // index inside bytesUnknown and if we pass the adjusted pointer we can end up with negative index values. - // We store the original pSrc in pOriginalSrc and then pass pOriginalSrc to fallback.InternalFallback. - byte* pOriginalSrc = pSrc; - - // Get our byte[] - byte[] bytesUnknown = GetBytesUnknown(ref pSrc, ch); - - // Do the actual fallback - int count = fallback.InternalFallback(bytesUnknown, pOriginalSrc); - - // # of fallback chars expected. - // Note that we only get here for "long" sequences, and have already unreserved - // the count that we prereserved for the input bytes - return count; + return new EncoderNLS(this); } - // Note that some of these bytes may have come from a previous fallback, so we cannot - // just decrement the pointer and use the values we read. In those cases we have - // to regenerate the original values. - private unsafe byte[] GetBytesUnknown(ref byte* pSrc, int ch) - { - // Get our byte[] - byte[] bytesUnknown = null; + // + // Beginning of methods used by shared fallback logic. + // - // See if it was a plain char - // (have to check >= 0 because we have all sorts of wierd bit flags) - if (ch < 0x100 && ch >= 0) - { - pSrc--; - bytesUnknown = new byte[] { unchecked((byte)ch) }; - } - // See if its an unfinished 2 byte sequence - else if ((ch & (SupplimentarySeq | ThreeByteSeq)) == 0) - { - pSrc--; - bytesUnknown = new byte[] { unchecked((byte)((ch & 0x1F) | 0xc0)) }; - } - // So now we're either 2nd byte of 3 or 4 byte sequence or - // we hit a non-trail byte or we ran out of space for 3rd byte of 4 byte sequence - // 1st check if its a 4 byte sequence - else if ((ch & SupplimentarySeq) != 0) - { - // 3rd byte of 4 byte sequence? 
- if ((ch & (FinalByte >> 6)) != 0) - { - // 3rd byte of 4 byte sequence - pSrc -= 3; - bytesUnknown = new byte[] { - unchecked((byte)(((ch >> 12) & 0x07) | 0xF0)), - unchecked((byte)(((ch >> 6) & 0x3F) | 0x80)), - unchecked((byte)(((ch) & 0x3F) | 0x80)) }; - } - else if ((ch & (FinalByte >> 12)) != 0) - { - // 2nd byte of a 4 byte sequence - pSrc -= 2; - bytesUnknown = new byte[] { - unchecked((byte)(((ch >> 6) & 0x07) | 0xF0)), - unchecked((byte)(((ch) & 0x3F) | 0x80)) }; - } - else - { - // 4th byte of a 4 byte sequence - pSrc--; - bytesUnknown = new byte[] { unchecked((byte)(((ch) & 0x07) | 0xF0)) }; - } - } - else - { - // 2nd byte of 3 byte sequence? - if ((ch & (FinalByte >> 6)) != 0) - { - // So its 2nd byte of a 3 byte sequence - pSrc -= 2; - bytesUnknown = new byte[] { - unchecked((byte)(((ch >> 6) & 0x0F) | 0xE0)), unchecked ((byte)(((ch) & 0x3F) | 0x80)) }; - } - else - { - // 1st byte of a 3 byte sequence - pSrc--; - bytesUnknown = new byte[] { unchecked((byte)(((ch) & 0x0F) | 0xE0)) }; - } - } + internal sealed override bool TryGetByteCount(Rune value, out int byteCount) + { + // All well-formed Rune instances can be converted to 1..4 UTF-8 code units. - return bytesUnknown; + byteCount = value.Utf8SequenceLength; + return true; } - - public override Decoder GetDecoder() + internal sealed override OperationStatus EncodeRune(Rune value, Span bytes, out int bytesWritten) { - return new UTF8Decoder(this); - } + // All well-formed Rune instances can be encoded as 1..4 UTF-8 code units. + // If there's an error, it's because the destination was too small. + return value.TryEncodeToUtf8(bytes, out bytesWritten) ? 
OperationStatus.Done : OperationStatus.DestinationTooSmall; + } - public override Encoder GetEncoder() + internal sealed override OperationStatus DecodeFirstRune(ReadOnlySpan bytes, out Rune value, out int bytesConsumed) { - return new UTF8Encoder(this); + return Rune.DecodeFromUtf8(bytes, out value, out bytesConsumed); } + // + // End of methods used by shared fallback logic. + // public override int GetMaxByteCount(int charCount) { @@ -2571,62 +858,5 @@ namespace System.Text return this.EncoderFallback.GetHashCode() + this.DecoderFallback.GetHashCode() + UTF8_CODEPAGE + (_emitUTF8Identifier ? 1 : 0); } - - private sealed class UTF8Encoder : EncoderNLS - { - // We must save a high surrogate value until the next call, looking - // for a low surrogate value. - internal int surrogateChar; - - public UTF8Encoder(UTF8Encoding encoding) : base(encoding) - { - // base calls reset - } - - public override void Reset() - - { - this.surrogateChar = 0; - if (_fallbackBuffer != null) - _fallbackBuffer.Reset(); - } - - // Anything left in our encoder? - internal override bool HasState - { - get - { - return (this.surrogateChar != 0); - } - } - } - - private sealed class UTF8Decoder : DecoderNLS - { - // We'll need to remember the previous information. See the comments around definition - // of FinalByte for details. - internal int bits; - - public UTF8Decoder(UTF8Encoding encoding) : base(encoding) - { - // base calls reset - } - - public override void Reset() - { - this.bits = 0; - if (_fallbackBuffer != null) - _fallbackBuffer.Reset(); - } - - // Anything left in our decoder? 
- internal override bool HasState - { - get - { - return (this.bits != 0); - } - } - } } } diff --git a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf16Utility.Validation.cs b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf16Utility.Validation.cs new file mode 100644 index 0000000..83f87f9 --- /dev/null +++ b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf16Utility.Validation.cs @@ -0,0 +1,361 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.Diagnostics; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +using System.Numerics; +using Internal.Runtime.CompilerServices; + +#if BIT64 +using nint = System.Int64; +using nuint = System.UInt64; +#else // BIT64 +using nint = System.Int32; +using nuint = System.UInt32; +#endif // BIT64 + +namespace System.Text.Unicode +{ + internal static unsafe partial class Utf16Utility + { + // Returns &inputBuffer[inputLength] if the input buffer is valid. + /// + /// Given an input buffer of char length , + /// returns a pointer to where the first invalid data appears in . + /// + /// + /// Returns a pointer to the end of if the buffer is well-formed. + /// + public static char* GetPointerToFirstInvalidChar(char* pInputBuffer, int inputLength, out long utf8CodeUnitCountAdjustment, out int scalarCountAdjustment) + { + // First, we'll handle the common case of all-ASCII. If this is able to + // consume the entire buffer, we'll skip the remainder of this method's logic. 
+ + int numAsciiCharsConsumedJustNow = (int)ASCIIUtility.GetIndexOfFirstNonAsciiChar(pInputBuffer, (uint)inputLength); + Debug.Assert(0 <= numAsciiCharsConsumedJustNow && numAsciiCharsConsumedJustNow <= inputLength); + + pInputBuffer += (uint)numAsciiCharsConsumedJustNow; + inputLength -= numAsciiCharsConsumedJustNow; + + if (inputLength == 0) + { + utf8CodeUnitCountAdjustment = 0; + scalarCountAdjustment = 0; + return pInputBuffer; + } + + // If we got here, it means we saw some non-ASCII data, so within our + // vectorized code paths below we'll handle all non-surrogate UTF-16 + // code points branchlessly. We'll only branch if we see surrogates. + // + // We still optimistically assume the data is mostly ASCII. This means that the + // number of UTF-8 code units and the number of scalars almost matches the number + // of UTF-16 code units. As we go through the input and find non-ASCII + // characters, we'll keep track of these "adjustment" fixups. To get the + // total number of UTF-8 code units required to encode the input data, add + // the UTF-8 code unit count adjustment to the number of UTF-16 code units + // seen. To get the total number of scalars present in the input data, + // add the scalar count adjustment to the number of UTF-16 code units seen. 
+ + long tempUtf8CodeUnitCountAdjustment = 0; + int tempScalarCountAdjustment = 0; + + if (Sse41.IsSupported) + { + if (inputLength >= Vector128.Count) + { + Vector128 vector0080 = Vector128.Create((ushort)0x80); + Vector128 vector0800 = Sse2.ShiftLeftLogical(vector0080, 4); // = 0x0800 + Vector128 vectorA800 = Vector128.Create((ushort)0xA800); + Vector128 vector8800 = Vector128.Create(unchecked((short)0x8800)); + + do + { + Vector128 utf16Data = Sse2.LoadVector128((ushort*)pInputBuffer); + + uint mask = (uint)Sse2.MoveMask( + Sse2.Or( + Sse2.ShiftLeftLogical(Sse41.Min(utf16Data, vector0080), 8), + Sse2.ShiftRightLogical(Sse41.Min(utf16Data, vector0800), 4)).AsByte()); + + // Each odd bit of mask will be 1 only if the char was >= 0x0080, + // and each even bit of mask will be 1 only if the char was >= 0x0800. + // + // Example for UTF-16 input "[ 0123 ] [ 1234 ] ...": + // + // ,-- set if char[1] is non-ASCII + // | ,-- set if char[0] is non-ASCII + // v v + // mask = ... 1 1 1 0 + // ^ ^-- set if char[0] is >= 0x800 + // `-- set if char[1] is >= 0x800 + // + // This means we can popcnt the number of set bits, and the result is the + // number of *additional* UTF-8 bytes that each UTF-16 code unit requires as + // it expands. This results in the wrong count for UTF-16 surrogate code + // units (we just counted that each individual code unit expands to 3 bytes, + // but in reality a well-formed UTF-16 surrogate pair expands to 4 bytes). + // We'll handle this in just a moment. + // + // For now, compute the popcnt but squirrel it away. We'll fold it in to the + // cumulative UTF-8 adjustment factor once we determine that there are no + // unpaired surrogates in our data. (Unpaired surrogates would invalidate + // our computed result and we'd have to throw it away.) 
+ + uint popcnt = (uint)BitOperations.PopCount(mask); + + // Surrogates need to be special-cased for two reasons: (a) we need + // to account for the fact that we over-counted in the addition above; + // and (b) they require separate validation. + + utf16Data = Sse2.Add(utf16Data, vectorA800); + mask = (uint)Sse2.MoveMask(Sse2.CompareLessThan(utf16Data.AsInt16(), vector8800).AsByte()); + + if (mask != 0) + { + // There's at least one UTF-16 surrogate code unit present. + // Since we performed a pmovmskb operation on the result of a 16-bit pcmpgtw, + // the resulting bits of 'mask' will occur in pairs: + // - 00 if the corresponding UTF-16 char was not a surrogate code unit; + // - 11 if the corresponding UTF-16 char was a surrogate code unit. + // + // A UTF-16 high/low surrogate code unit has the bit pattern [ 11011q## ######## ], + // where # is any bit; q = 0 represents a high surrogate, and q = 1 represents + // a low surrogate. Since we added 0xA800 in the vectorized operation above, + // our surrogate pairs will now have the bit pattern [ 10000q## ######## ]. + // If we logical right-shift each word by 3, we'll end up with the bit pattern + // [ 00010000 q####### ], which means that we can immediately use pmovmskb to + // determine whether a given char was a high or a low surrogate. + // + // Therefore the resulting bits of 'mask2' will occur in pairs: + // - 00 if the corresponding UTF-16 char was a high surrogate code unit; + // - 01 if the corresponding UTF-16 char was a low surrogate code unit; + // - ## (garbage) if the corresponding UTF-16 char was not a surrogate code unit. 
+ + uint mask2 = (uint)Sse2.MoveMask(Sse2.ShiftRightLogical(utf16Data, 3).AsByte()); + + uint lowSurrogatesMask = mask2 & mask; // 01 only if was a low surrogate char, else 00 + uint highSurrogatesMask = (mask2 ^ mask) & 0x5555u; // 01 only if was a high surrogate char, else 00 + + // Now check that each high surrogate is followed by a low surrogate and that each + // low surrogate follows a high surrogate. We make an exception for the case where + // the final char of the vector is a high surrogate, since we can't perform validation + // on it until the next iteration of the loop when we hope to consume the matching + // low surrogate. + + highSurrogatesMask <<= 2; + if ((ushort)highSurrogatesMask != lowSurrogatesMask) + { + goto NonVectorizedLoop; // error: mismatched surrogate pair; break out of vectorized logic + } + + if (highSurrogatesMask > ushort.MaxValue) + { + // There was a standalone high surrogate at the end of the vector. + // We'll adjust our counters so that we don't consider this char consumed. + + highSurrogatesMask = (ushort)highSurrogatesMask; // don't allow stray high surrogate to be consumed by popcnt + popcnt -= 2; // the '0xC000_0000' bits in the original mask are shifted out and discarded, so account for that here + pInputBuffer--; + inputLength++; + } + + int surrogatePairsCount = BitOperations.PopCount(highSurrogatesMask); + + // 2 UTF-16 chars become 1 Unicode scalar + + tempScalarCountAdjustment -= surrogatePairsCount; + + // Since each surrogate code unit was >= 0x0800, we eagerly assumed + // it'd be encoded as 3 UTF-8 code units, so our earlier popcnt computation + // assumes that the pair is encoded as 6 UTF-8 code units. Since each + // pair is in reality only encoded as 4 UTF-8 code units, we need to + // perform this adjustment now. 
+ + nint surrogatePairsCountNint = (nint)(nuint)(uint)surrogatePairsCount; // zero-extend to native int size + tempUtf8CodeUnitCountAdjustment -= surrogatePairsCountNint; + tempUtf8CodeUnitCountAdjustment -= surrogatePairsCountNint; + } + + tempUtf8CodeUnitCountAdjustment += popcnt; + pInputBuffer += Vector128.Count; + inputLength -= Vector128.Count; + } while (inputLength >= Vector128.Count); + } + } + else if (Vector.IsHardwareAccelerated) + { + if (inputLength >= Vector.Count) + { + Vector vector0080 = new Vector(0x0080); + Vector vector0400 = new Vector(0x0400); + Vector vector0800 = new Vector(0x0800); + Vector vectorD800 = new Vector(0xD800); + + do + { + // The 'twoOrMoreUtf8Bytes' and 'threeOrMoreUtf8Bytes' vectors will contain + // elements whose values are 0xFFFF (-1 as signed word) iff the corresponding + // UTF-16 code unit was >= 0x0080 and >= 0x0800, respectively. By summing these + // vectors, each element of the sum will contain one of three values: + // + // 0x0000 ( 0) = original char was 0000..007F + // 0xFFFF (-1) = original char was 0080..07FF + // 0xFFFE (-2) = original char was 0800..FFFF + // + // We'll negate them to produce a value 0..2 for each element, then sum all the + // elements together to produce the number of *additional* UTF-8 code units + // required to represent this UTF-16 data. This is similar to the popcnt step + // performed by the SSE41 code path. This will overcount surrogates, but we'll + // handle that shortly. + + Vector utf16Data = Unsafe.ReadUnaligned>(pInputBuffer); + Vector twoOrMoreUtf8Bytes = Vector.GreaterThanOrEqual(utf16Data, vector0080); + Vector threeOrMoreUtf8Bytes = Vector.GreaterThanOrEqual(utf16Data, vector0800); + Vector sumVector = (Vector)(-Vector.Add(twoOrMoreUtf8Bytes, threeOrMoreUtf8Bytes)); + + // We'll try summing by a natural word (rather than a 16-bit word) at a time, + // which should halve the number of operations we must perform. 
+ + nuint popcnt = 0; + for (int i = 0; i < Vector.Count; i++) + { + popcnt += sumVector[i]; + } + + uint popcnt32 = (uint)popcnt; + if (IntPtr.Size == 8) + { + popcnt32 += (uint)(popcnt >> 32); + } + + // As in the SSE4.1 paths, compute popcnt but don't fold it in until we + // know there aren't any unpaired surrogates in the input data. + + popcnt32 = (ushort)popcnt32 + (popcnt32 >> 16); + + // Now check for surrogates. + + utf16Data -= vectorD800; + Vector surrogateChars = Vector.LessThan(utf16Data, vector0800); + if (surrogateChars != Vector.Zero) + { + // There's at least one surrogate (high or low) UTF-16 code unit in + // the vector. We'll build up additional vectors: 'highSurrogateChars' + // and 'lowSurrogateChars', where the elements are 0xFFFF iff the original + // UTF-16 code unit was a high or low surrogate, respectively. + + Vector highSurrogateChars = Vector.LessThan(utf16Data, vector0400); + Vector lowSurrogateChars = Vector.AndNot(surrogateChars, highSurrogateChars); + + // We want to make sure that each high surrogate code unit is followed by + // a low surrogate code unit and each low surrogate code unit follows a + // high surrogate code unit. Since we don't have an equivalent of pmovmskb + // or palignr available to us, we'll do this as a loop. We won't look at + // the very last high surrogate char element since we don't yet know if + // the next vector read will have a low surrogate char element. + + ushort surrogatePairsCount = 0; + for (int i = 0; i < Vector.Count - 1; i++) + { + surrogatePairsCount -= highSurrogateChars[i]; + if (highSurrogateChars[i] != lowSurrogateChars[i + 1]) + { + goto NonVectorizedLoop; // error: mismatched surrogate pair; break out of vectorized logic + } + } + + if (highSurrogateChars[Vector.Count - 1] != 0) + { + // There was a standalone high surrogate at the end of the vector. + // We'll adjust our counters so that we don't consider this char consumed. 
+ + pInputBuffer--; + inputLength++; + popcnt32 -= 2; + tempScalarCountAdjustment--; + } + + nint surrogatePairsCountNint = (nint)surrogatePairsCount; // zero-extend to native int size + + // 2 UTF-16 chars become 1 Unicode scalar + + tempScalarCountAdjustment -= (int)surrogatePairsCountNint; + + // Since each surrogate code unit was >= 0x0800, we eagerly assumed + // it'd be encoded as 3 UTF-8 code units. Each surrogate half is only + // encoded as 2 UTF-8 code units (for 4 UTF-8 code units total), + // so we'll adjust this now. + + tempUtf8CodeUnitCountAdjustment -= surrogatePairsCountNint; + tempUtf8CodeUnitCountAdjustment -= surrogatePairsCountNint; + } + + tempUtf8CodeUnitCountAdjustment += popcnt32; + pInputBuffer += Vector.Count; + inputLength -= Vector.Count; + } while (inputLength >= Vector.Count); + } + } + + NonVectorizedLoop: + + // Vectorization isn't supported on our current platform, or the input was too small to benefit + // from vectorization, or we saw invalid UTF-16 data in the vectorized code paths and need to + // drain remaining valid chars before we report failure. + + for (; inputLength > 0; pInputBuffer++, inputLength--) + { + uint thisChar = pInputBuffer[0]; + if (thisChar <= 0x7F) + { + continue; + } + + // Bump adjustment by +1 for U+0080..U+07FF; by +2 for U+0800..U+FFFF. + // This optimistically assumes no surrogates, which we'll handle shortly. + + tempUtf8CodeUnitCountAdjustment += (thisChar + 0x0001_F800u) >> 16; + + if (!UnicodeUtility.IsSurrogateCodePoint(thisChar)) + { + continue; + } + + // Found a surrogate char. Back out the adjustment we made above, then + // try to consume the entire surrogate pair all at once. We won't bother + // trying to interpret the surrogate pair as a scalar value; we'll only + // validate that its bit pattern matches what's expected for a surrogate pair. 
+ + tempUtf8CodeUnitCountAdjustment -= 2; + + if (inputLength == 1) + { + goto Error; // input buffer too small to read a surrogate pair + } + + thisChar = Unsafe.ReadUnaligned(pInputBuffer); + if (((thisChar - (BitConverter.IsLittleEndian ? 0xDC00_D800u : 0xD800_DC00u)) & 0xFC00_FC00u) != 0) + { + goto Error; // not a well-formed surrogate pair + } + + tempScalarCountAdjustment--; // 2 UTF-16 code units -> 1 scalar + tempUtf8CodeUnitCountAdjustment += 2; // 2 UTF-16 code units -> 4 UTF-8 code units + + pInputBuffer++; // consumed one extra char + inputLength--; + } + + Error: + + // Also used for normal return. + + utf8CodeUnitCountAdjustment = tempUtf8CodeUnitCountAdjustment; + scalarCountAdjustment = tempScalarCountAdjustment; + return pInputBuffer; + } + } +} diff --git a/src/System.Private.CoreLib/shared/System/Text/Utf16Utility.cs b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf16Utility.cs similarity index 99% rename from src/System.Private.CoreLib/shared/System/Text/Utf16Utility.cs rename to src/System.Private.CoreLib/shared/System/Text/Unicode/Utf16Utility.cs index bed3905..828776b 100644 --- a/src/System.Private.CoreLib/shared/System/Text/Utf16Utility.cs +++ b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf16Utility.cs @@ -5,7 +5,7 @@ using System.Runtime.CompilerServices; using System.Diagnostics; -namespace System.Text +namespace System.Text.Unicode { internal static partial class Utf16Utility { diff --git a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8.cs b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8.cs index 657dc17..b4cae37 100644 --- a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8.cs +++ b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8.cs @@ -39,7 +39,7 @@ namespace System.Text.Unicode /// in will be replaced with U+FFFD in , and /// this method will not return . 
/// - public static unsafe OperationStatus FromUtf16(ReadOnlySpan source, Span destination, out int numCharsRead, out int numBytesWritten, bool replaceInvalidSequences = true, bool isFinalBlock = true) + public static unsafe OperationStatus FromUtf16(ReadOnlySpan source, Span destination, out int charsRead, out int bytesWritten, bool replaceInvalidSequences = true, bool isFinalBlock = true) { // Throwaway span accesses - workaround for https://github.com/dotnet/coreclr/issues/23437 @@ -116,8 +116,8 @@ namespace System.Text.Unicode // Not possible to make any further progress - report to our caller how far we got. - numCharsRead = (int)(pInputBufferRemaining - pOriginalSource); - numBytesWritten = (int)(pOutputBufferRemaining - pOriginalDestination); + charsRead = (int)(pInputBufferRemaining - pOriginalSource); + bytesWritten = (int)(pOutputBufferRemaining - pOriginalDestination); return operationStatus; } } diff --git a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Transcoding.cs b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Transcoding.cs index c9ae2d9..0008d0e 100644 --- a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Transcoding.cs +++ b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Transcoding.cs @@ -928,7 +928,7 @@ namespace System.Text.Unicode if (BitConverter.IsLittleEndian && Bmi2.X64.IsSupported) { - const ulong PEXT_MASK = 0x007F007F_007F007Ful; + const ulong PEXT_MASK = 0x00FF00FF_00FF00FFul; // Try reading and writing 8 elements per iteration. 
uint maxIters = minElementsRemaining / 8; diff --git a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Validation.cs b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Validation.cs index 68cd054..cf6c9bc 100644 --- a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Validation.cs +++ b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Validation.cs @@ -40,14 +40,13 @@ namespace System.Text.Unicode // Quick check - did we just end up consuming the entire input buffer? // If so, short-circuit the remainder of the method. - if ((int)numAsciiBytesCounted == inputLength) + inputLength -= (int)numAsciiBytesCounted; + if (inputLength == 0) { utf16CodeUnitCountAdjustment = 0; scalarCountAdjustment = 0; return pInputBuffer; } - - inputLength -= (int)numAsciiBytesCounted; } #if DEBUG @@ -604,9 +603,9 @@ namespace System.Text.Unicode Debug.Assert(inputBufferRemainingBytes < 4); while (inputBufferRemainingBytes > 0) { - byte firstByte = pInputBuffer[0]; + uint firstByte = pInputBuffer[0]; - if (firstByte < 0x80u) + if ((byte)firstByte < 0x80u) { // 1-byte (ASCII) case pInputBuffer++; @@ -616,10 +615,10 @@ namespace System.Text.Unicode else if (inputBufferRemainingBytes >= 2) { uint secondByte = pInputBuffer[1]; // typed as 32-bit since we perform arithmetic (not just comparisons) on this value - if (firstByte < 0xE0u) + if ((byte)firstByte < 0xE0u) { // 2-byte case - if (firstByte >= 0xC2u && IsLowByteUtf8ContinuationByte(secondByte)) + if ((byte)firstByte >= 0xC2u && IsLowByteUtf8ContinuationByte(secondByte)) { pInputBuffer += 2; tempUtf16CodeUnitCountAdjustment--; // 2 UTF-8 bytes -> 1 UTF-16 code unit (and 1 scalar) @@ -629,16 +628,16 @@ namespace System.Text.Unicode } else if (inputBufferRemainingBytes >= 3) { - if (firstByte <= 0xF0u) + if ((byte)firstByte < 0xF0u) { - if (firstByte == 0xE0u) + if ((byte)firstByte == 0xE0u) { if (!UnicodeUtility.IsInRangeInclusive(secondByte, 0xA0u, 0xBFu)) { goto Error; 
// overlong encoding } } - else if (firstByte == 0xEDu) + else if ((byte)firstByte == 0xEDu) { if (!UnicodeUtility.IsInRangeInclusive(secondByte, 0x80u, 0x9Fu)) { diff --git a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.cs b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.cs index 218e79d..d24f766 100644 --- a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.cs +++ b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.cs @@ -39,7 +39,7 @@ namespace System.Text.Unicode int index = (int)(void*)Unsafe.ByteOffset(ref *pUtf8Data, ref *pFirstInvalidByte); isAscii = (utf16CodeUnitCountAdjustment == 0); // If UTF-16 char count == UTF-8 byte count, it's ASCII. - return (index <= utf8Data.Length) ? index : -1; + return (index < utf8Data.Length) ? index : -1; } } -- 2.7.4