// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

// Provenance (recovered from the commit-diff header this text was extracted from):
//   From: Levi Broderick
//   Date: Wed, 13 Mar 2019 22:43:25 +0000 (-0700)
//   Subject: Add OperationStatus-based UTF8 transcoding APIs (dotnet/coreclr#23219)
//   X-Git-Tag: submit/tizen/20210909.063632~11030^2~2181
//   X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=cdd4b571b71d934878f1cf48516ef5a6f420c190;p=platform%2Fupstream%2Fdotnet%2Fruntime.git
//   Commit migrated from https://github.com/dotnet/coreclr/commit/fdd611cbcdc0f7d922dd111e5e1831663c67e685
//   New file: src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8.cs
//   The same commit also adds one line to
//   src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems
//   (hunk @@ -798,6 +798,7 @@); the content of that line is not visible in the extracted text.

using System.Buffers;
using System.Diagnostics;

namespace System.Text.Unicode
{
    /// <summary>
    /// Provides <see cref="OperationStatus"/>-based routines for transcoding between
    /// UTF-16 and UTF-8 buffers, one scalar value at a time.
    /// </summary>
    public static class Utf8
    {
        /*
         * OperationStatus-based APIs for transcoding of chunked data.
         * These methods are similar to Encoding.UTF8.GetBytes / GetChars but have a
         * different calling convention, different error handling mechanisms, and
         * different performance characteristics.
         *
         * If 'replaceInvalidSequences' is true, the method will replace any ill-formed
         * subsequence in the source with U+FFFD when transcoding to the destination,
         * then it will continue processing the remainder of the buffers. Otherwise
         * the method will return OperationStatus.InvalidData.
         *
         * If the method does return an error code, the out parameters will represent
         * how much of the data was successfully transcoded, and the location of the
         * ill-formed subsequence can be deduced from these values.
         *
         * If 'replaceInvalidSequences' is true, the method is guaranteed never to return
         * OperationStatus.InvalidData. If 'isFinalBlock' is true, the method is
         * guaranteed never to return OperationStatus.NeedMoreData.
         */

        /// <summary>
        /// Transcodes the UTF-16 <paramref name="source"/> buffer to <paramref name="destination"/> as UTF-8.
        /// </summary>
        /// <remarks>
        /// If <paramref name="replaceInvalidSequences"/> is <see langword="true"/>, invalid UTF-16 sequences
        /// in <paramref name="source"/> will be replaced with U+FFFD in <paramref name="destination"/>, and
        /// this method will not return <see cref="OperationStatus.InvalidData"/>.
        /// </remarks>
        /// <param name="source">The UTF-16 input to transcode.</param>
        /// <param name="destination">The buffer that receives the UTF-8 output.</param>
        /// <param name="numCharsRead">
        /// On return, the number of UTF-16 code units of <paramref name="source"/> that were transcoded.
        /// </param>
        /// <param name="numBytesWritten">
        /// On return, the number of bytes written to <paramref name="destination"/>.
        /// </param>
        /// <param name="replaceInvalidSequences">
        /// <see langword="true"/> to substitute U+FFFD for ill-formed input and keep going;
        /// <see langword="false"/> to stop at the first ill-formed sequence.
        /// </param>
        /// <param name="isFinalBlock">
        /// <see langword="true"/> if no further input will follow, in which case an incomplete
        /// sequence at the end of <paramref name="source"/> is treated as invalid data;
        /// <see langword="false"/> to report <see cref="OperationStatus.NeedMoreData"/> instead.
        /// </param>
        /// <returns>
        /// <see cref="OperationStatus.Done"/> when all of <paramref name="source"/> was transcoded;
        /// otherwise the status describing why transcoding stopped early.
        /// </returns>
        public static OperationStatus FromUtf16(ReadOnlySpan<char> source, Span<byte> destination, out int numCharsRead, out int numBytesWritten, bool replaceInvalidSequences = true, bool isFinalBlock = true)
        {
            int originalSourceLength = source.Length;
            int originalDestinationLength = destination.Length;
            OperationStatus status = OperationStatus.Done;

            // In a loop, this is going to read and transcode one scalar value at a time
            // from the source to the destination.

            while (!source.IsEmpty)
            {
                status = Rune.DecodeUtf16(source, out Rune firstScalarValue, out int charsConsumed);

                switch (status)
                {
                    case OperationStatus.NeedMoreData:

                        // Input buffer ended with a high surrogate. Only treat this as an error
                        // if the caller told us that we shouldn't expect additional data in a
                        // future call.

                        if (!isFinalBlock)
                        {
                            goto Finish;
                        }

                        status = OperationStatus.InvalidData;
                        goto case OperationStatus.InvalidData;

                    case OperationStatus.InvalidData:

                        // Input buffer contained invalid data. If the caller told us not to
                        // perform U+FFFD replacement, terminate the loop immediately and return
                        // an error to the caller.

                        if (!replaceInvalidSequences)
                        {
                            goto Finish;
                        }

                        firstScalarValue = Rune.ReplacementChar;
                        goto default;

                    default:

                        // We know which scalar value we need to transcode to UTF-8.
                        // Do so now, and only terminate the loop if we ran out of space
                        // in the destination buffer.

                        if (firstScalarValue.TryEncodeToUtf8Bytes(destination, out int bytesWritten))
                        {
                            source = source.Slice(charsConsumed); // don't use Rune.Utf8SequenceLength; we may have performed substitution
                            destination = destination.Slice(bytesWritten);
                            status = OperationStatus.Done; // forcibly set success
                            continue;
                        }
                        else
                        {
                            status = OperationStatus.DestinationTooSmall;
                            goto Finish;
                        }
                }
            }

        Finish:

            numCharsRead = originalSourceLength - source.Length;
            numBytesWritten = originalDestinationLength - destination.Length;

            // Invariant: Done implies the whole input was consumed. The check must use '=='
            // (not '<'), otherwise it fails on every successful call, including empty input.
            Debug.Assert(numCharsRead == originalSourceLength || status != OperationStatus.Done,
                "Cannot report OperationStatus.Done if we haven't consumed the entire input buffer.");

            return status;
        }

        /// <summary>
        /// Transcodes the UTF-8 <paramref name="source"/> buffer to <paramref name="destination"/> as UTF-16.
        /// </summary>
        /// <remarks>
        /// If <paramref name="replaceInvalidSequences"/> is <see langword="true"/>, invalid UTF-8 sequences
        /// in <paramref name="source"/> will be replaced with U+FFFD in <paramref name="destination"/>, and
        /// this method will not return <see cref="OperationStatus.InvalidData"/>.
        /// </remarks>
        /// <param name="source">The UTF-8 input to transcode.</param>
        /// <param name="destination">The buffer that receives the UTF-16 output.</param>
        /// <param name="numBytesRead">
        /// On return, the number of bytes of <paramref name="source"/> that were transcoded.
        /// </param>
        /// <param name="numCharsWritten">
        /// On return, the number of UTF-16 code units written to <paramref name="destination"/>.
        /// </param>
        /// <param name="replaceInvalidSequences">
        /// <see langword="true"/> to substitute U+FFFD for ill-formed input and keep going;
        /// <see langword="false"/> to stop at the first ill-formed sequence.
        /// </param>
        /// <param name="isFinalBlock">
        /// <see langword="true"/> if no further input will follow, in which case an incomplete
        /// sequence at the end of <paramref name="source"/> is treated as invalid data;
        /// <see langword="false"/> to report <see cref="OperationStatus.NeedMoreData"/> instead.
        /// </param>
        /// <returns>
        /// <see cref="OperationStatus.Done"/> when all of <paramref name="source"/> was transcoded;
        /// otherwise the status describing why transcoding stopped early.
        /// </returns>
        public static OperationStatus ToUtf16(ReadOnlySpan<byte> source, Span<char> destination, out int numBytesRead, out int numCharsWritten, bool replaceInvalidSequences = true, bool isFinalBlock = true)
        {
            int originalSourceLength = source.Length;
            int originalDestinationLength = destination.Length;
            OperationStatus status = OperationStatus.Done;

            // In a loop, this is going to read and transcode one scalar value at a time
            // from the source to the destination.

            while (!source.IsEmpty)
            {
                status = Rune.DecodeUtf8(source, out Rune firstScalarValue, out int bytesConsumed);

                switch (status)
                {
                    case OperationStatus.NeedMoreData:

                        // Input buffer ended with a partial UTF-8 sequence. Only treat this as an error
                        // if the caller told us that we shouldn't expect additional data in a
                        // future call.

                        if (!isFinalBlock)
                        {
                            goto Finish;
                        }

                        status = OperationStatus.InvalidData;
                        goto case OperationStatus.InvalidData;

                    case OperationStatus.InvalidData:

                        // Input buffer contained invalid data. If the caller told us not to
                        // perform U+FFFD replacement, terminate the loop immediately and return
                        // an error to the caller.

                        if (!replaceInvalidSequences)
                        {
                            goto Finish;
                        }

                        firstScalarValue = Rune.ReplacementChar;
                        goto default;

                    default:

                        // We know which scalar value we need to transcode to UTF-16.
                        // Do so now, and only terminate the loop if we ran out of space
                        // in the destination buffer.

                        if (firstScalarValue.TryEncode(destination, out int charsWritten))
                        {
                            source = source.Slice(bytesConsumed); // don't use Rune.Utf16SequenceLength; we may have performed substitution
                            destination = destination.Slice(charsWritten);
                            status = OperationStatus.Done; // forcibly set success
                            continue;
                        }
                        else
                        {
                            status = OperationStatus.DestinationTooSmall;
                            goto Finish;
                        }
                }
            }

        Finish:

            numBytesRead = originalSourceLength - source.Length;
            numCharsWritten = originalDestinationLength - destination.Length;

            // Invariant: Done implies the whole input was consumed. The check must use '=='
            // (not '<'), otherwise it fails on every successful call, including empty input.
            Debug.Assert(numBytesRead == originalSourceLength || status != OperationStatus.Done,
                "Cannot report OperationStatus.Done if we haven't consumed the entire input buffer.");

            return status;
        }
    }
}