Add OperationStatus-based UTF8 transcoding APIs (dotnet/coreclr#23219)

author Levi Broderick <GrabYourPitchforks@users.noreply.github.com>

Wed, 13 Mar 2019 22:43:25 +0000 (15:43 -0700)

committer GitHub <noreply@github.com>

Wed, 13 Mar 2019 22:43:25 +0000 (15:43 -0700)
author Levi Broderick <GrabYourPitchforks@users.noreply.github.com>
Wed, 13 Mar 2019 22:43:25 +0000 (15:43 -0700)
committer GitHub <noreply@github.com>
Wed, 13 Mar 2019 22:43:25 +0000 (15:43 -0700)
diff --git a/src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems b/src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems

index 133f169..91b6527 100644 (file)
--- a/src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems
+++ b/src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems
@@ -798,6 +798,7 @@
      <Compile Include="$(MSBuildThisFileDirectory)System\Text\UTF7Encoding.cs" />
      <Compile Include="$(MSBuildThisFileDirectory)System\Text\UTF8Encoding.cs" />
      <Compile Include="$(MSBuildThisFileDirectory)System\Text\ValueStringBuilder.cs" />
+    <Compile Include="$(MSBuildThisFileDirectory)System\Text\Unicode\Utf8.cs" />
      <Compile Include="$(MSBuildThisFileDirectory)System\TimeSpan.cs" />
      <Compile Include="$(MSBuildThisFileDirectory)System\ThreadAttributes.cs" />
      <Compile Include="$(MSBuildThisFileDirectory)System\Threading\AbandonedMutexException.cs" />
diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8.cs

new file mode 100644 (file)

index 0000000..0b94192
--- /dev/null
+++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8.cs
@@ -0,0 +1,198 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Buffers;
+using System.Diagnostics;
+
+namespace System.Text.Unicode
+{
+    /// <summary>
+    /// Provides static methods for transcoding text between UTF-16 and UTF-8 buffers,
+    /// reporting the outcome of each call as an <see cref="OperationStatus"/>.
+    /// </summary>
+    public static class Utf8
+    {
+        /*
+         * OperationStatus-based APIs for transcoding of chunked data.
+         * These methods are similar to Encoding.UTF8.GetBytes / GetChars but have a
+         * different calling convention, different error handling mechanisms, and
+         * different performance characteristics.
+         *
+         * If 'replaceInvalidSequences' is true, the method will replace any ill-formed
+         * subsequence in the source with U+FFFD when transcoding to the destination,
+         * then it will continue processing the remainder of the buffers. Otherwise
+         * the method will return OperationStatus.InvalidData.
+         *
+         * If the method does return an error code, the out parameters will represent
+         * how much of the data was successfully transcoded, and the location of the
+         * ill-formed subsequence can be deduced from these values.
+         *
+         * If 'replaceInvalidSequences' is true, the method is guaranteed never to return
+         * OperationStatus.InvalidData. If 'isFinalBlock' is true, the method is
+         * guaranteed never to return OperationStatus.NeedMoreData.
+         */
+
+        /// <summary>
+        /// Transcodes the UTF-16 <paramref name="source"/> buffer to <paramref name="destination"/> as UTF-8.
+        /// </summary>
+        /// <param name="source">The UTF-16 input to transcode.</param>
+        /// <param name="destination">The buffer that receives the UTF-8 output.</param>
+        /// <param name="numCharsRead">On return, the number of chars consumed from <paramref name="source"/>.</param>
+        /// <param name="numBytesWritten">On return, the number of bytes written to <paramref name="destination"/>.</param>
+        /// <param name="replaceInvalidSequences">
+        /// <see langword="true"/> to substitute U+FFFD for invalid input and keep going;
+        /// <see langword="false"/> to stop at the first invalid sequence and return <see cref="OperationStatus.InvalidData"/>.
+        /// </param>
+        /// <param name="isFinalBlock">
+        /// <see langword="true"/> if no further input will follow, in which case incomplete trailing data is
+        /// treated as invalid; <see langword="false"/> to stop before incomplete trailing data and return
+        /// <see cref="OperationStatus.NeedMoreData"/>.
+        /// </param>
+        /// <returns>
+        /// <see cref="OperationStatus.Done"/> if all of <paramref name="source"/> was transcoded;
+        /// <see cref="OperationStatus.DestinationTooSmall"/> if <paramref name="destination"/> could not hold
+        /// the next encoded scalar value; otherwise <see cref="OperationStatus.NeedMoreData"/> or
+        /// <see cref="OperationStatus.InvalidData"/> as described for the parameters above.
+        /// </returns>
+        /// <remarks>
+        /// If <paramref name="replaceInvalidSequences"/> is <see langword="true"/>, invalid UTF-16 sequences
+        /// in <paramref name="source"/> will be replaced with U+FFFD in <paramref name="destination"/>, and
+        /// this method will not return <see cref="OperationStatus.InvalidData"/>.
+        /// </remarks>
+        public static OperationStatus FromUtf16(ReadOnlySpan<char> source, Span<byte> destination, out int numCharsRead, out int numBytesWritten, bool replaceInvalidSequences = true, bool isFinalBlock = true)
+        {
+            int originalSourceLength = source.Length;
+            int originalDestinationLength = destination.Length;
+            OperationStatus status = OperationStatus.Done;
+
+            // In a loop, this is going to read and transcode one scalar value at a time
+            // from the source to the destination.
+
+            while (!source.IsEmpty)
+            {
+                status = Rune.DecodeUtf16(source, out Rune firstScalarValue, out int charsConsumed);
+
+                switch (status)
+                {
+                    case OperationStatus.NeedMoreData:
+
+                        // Input buffer ended with a high surrogate. Only treat this as an error
+                        // if the caller told us that we shouldn't expect additional data in a
+                        // future call.
+
+                        if (!isFinalBlock)
+                        {
+                            goto Finish;
+                        }
+
+                        status = OperationStatus.InvalidData;
+                        goto case OperationStatus.InvalidData;
+
+                    case OperationStatus.InvalidData:
+
+                        // Input buffer contained invalid data. If the caller told us not to
+                        // perform U+FFFD replacement, terminate the loop immediately and return
+                        // an error to the caller.
+
+                        if (!replaceInvalidSequences)
+                        {
+                            goto Finish;
+                        }
+
+                        firstScalarValue = Rune.ReplacementChar;
+                        goto default;
+
+                    default:
+
+                        // We know which scalar value we need to transcode to UTF-8.
+                        // Do so now, and only terminate the loop if we ran out of space
+                        // in the destination buffer.
+
+                        if (firstScalarValue.TryEncodeToUtf8Bytes(destination, out int bytesWritten))
+                        {
+                            source = source.Slice(charsConsumed); // don't use Rune.Utf16SequenceLength; we may have performed substitution
+                            destination = destination.Slice(bytesWritten);
+                            status = OperationStatus.Done; // forcibly set success
+                            continue;
+                        }
+                        else
+                        {
+                            status = OperationStatus.DestinationTooSmall;
+                            goto Finish;
+                        }
+                }
+            }
+
+        Finish:
+
+            numCharsRead = originalSourceLength - source.Length;
+            numBytesWritten = originalDestinationLength - destination.Length;
+
+            // Done is only left in 'status' when the loop has drained 'source' (or it was empty
+            // to begin with); every 'goto Finish' above carries a non-Done status.
+            Debug.Assert(numCharsRead == originalSourceLength || status != OperationStatus.Done,
+                "Cannot report OperationStatus.Done if we haven't consumed the entire input buffer.");
+
+            return status;
+        }
+
+        /// <summary>
+        /// Transcodes the UTF-8 <paramref name="source"/> buffer to <paramref name="destination"/> as UTF-16.
+        /// </summary>
+        /// <param name="source">The UTF-8 input to transcode.</param>
+        /// <param name="destination">The buffer that receives the UTF-16 output.</param>
+        /// <param name="numBytesRead">On return, the number of bytes consumed from <paramref name="source"/>.</param>
+        /// <param name="numCharsWritten">On return, the number of chars written to <paramref name="destination"/>.</param>
+        /// <param name="replaceInvalidSequences">
+        /// <see langword="true"/> to substitute U+FFFD for invalid input and keep going;
+        /// <see langword="false"/> to stop at the first invalid sequence and return <see cref="OperationStatus.InvalidData"/>.
+        /// </param>
+        /// <param name="isFinalBlock">
+        /// <see langword="true"/> if no further input will follow, in which case incomplete trailing data is
+        /// treated as invalid; <see langword="false"/> to stop before incomplete trailing data and return
+        /// <see cref="OperationStatus.NeedMoreData"/>.
+        /// </param>
+        /// <returns>
+        /// <see cref="OperationStatus.Done"/> if all of <paramref name="source"/> was transcoded;
+        /// <see cref="OperationStatus.DestinationTooSmall"/> if <paramref name="destination"/> could not hold
+        /// the next encoded scalar value; otherwise <see cref="OperationStatus.NeedMoreData"/> or
+        /// <see cref="OperationStatus.InvalidData"/> as described for the parameters above.
+        /// </returns>
+        /// <remarks>
+        /// If <paramref name="replaceInvalidSequences"/> is <see langword="true"/>, invalid UTF-8 sequences
+        /// in <paramref name="source"/> will be replaced with U+FFFD in <paramref name="destination"/>, and
+        /// this method will not return <see cref="OperationStatus.InvalidData"/>.
+        /// </remarks>
+        public static OperationStatus ToUtf16(ReadOnlySpan<byte> source, Span<char> destination, out int numBytesRead, out int numCharsWritten, bool replaceInvalidSequences = true, bool isFinalBlock = true)
+        {
+            int originalSourceLength = source.Length;
+            int originalDestinationLength = destination.Length;
+            OperationStatus status = OperationStatus.Done;
+
+            // In a loop, this is going to read and transcode one scalar value at a time
+            // from the source to the destination.
+
+            while (!source.IsEmpty)
+            {
+                status = Rune.DecodeUtf8(source, out Rune firstScalarValue, out int bytesConsumed);
+
+                switch (status)
+                {
+                    case OperationStatus.NeedMoreData:
+
+                        // Input buffer ended with a partial UTF-8 sequence. Only treat this as an error
+                        // if the caller told us that we shouldn't expect additional data in a
+                        // future call.
+
+                        if (!isFinalBlock)
+                        {
+                            goto Finish;
+                        }
+
+                        status = OperationStatus.InvalidData;
+                        goto case OperationStatus.InvalidData;
+
+                    case OperationStatus.InvalidData:
+
+                        // Input buffer contained invalid data. If the caller told us not to
+                        // perform U+FFFD replacement, terminate the loop immediately and return
+                        // an error to the caller.
+
+                        if (!replaceInvalidSequences)
+                        {
+                            goto Finish;
+                        }
+
+                        firstScalarValue = Rune.ReplacementChar;
+                        goto default;
+
+                    default:
+
+                        // We know which scalar value we need to transcode to UTF-16.
+                        // Do so now, and only terminate the loop if we ran out of space
+                        // in the destination buffer.
+
+                        if (firstScalarValue.TryEncode(destination, out int charsWritten))
+                        {
+                            source = source.Slice(bytesConsumed); // don't use Rune.Utf8SequenceLength; we may have performed substitution
+                            destination = destination.Slice(charsWritten);
+                            status = OperationStatus.Done; // forcibly set success
+                            continue;
+                        }
+                        else
+                        {
+                            status = OperationStatus.DestinationTooSmall;
+                            goto Finish;
+                        }
+                }
+            }
+
+        Finish:
+
+            numBytesRead = originalSourceLength - source.Length;
+            numCharsWritten = originalDestinationLength - destination.Length;
+
+            // Done is only left in 'status' when the loop has drained 'source' (or it was empty
+            // to begin with); every 'goto Finish' above carries a non-Done status.
+            Debug.Assert(numBytesRead == originalSourceLength || status != OperationStatus.Done,
+                "Cannot report OperationStatus.Done if we haven't consumed the entire input buffer.");
+
+            return status;
+        }
+    }
+}
author	Levi Broderick <GrabYourPitchforks@users.noreply.github.com>
	Wed, 13 Mar 2019 22:43:25 +0000 (15:43 -0700)
committer	GitHub <noreply@github.com>
	Wed, 13 Mar 2019 22:43:25 +0000 (15:43 -0700)
src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems		patch \| blob \| history
src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8.cs	[new file with mode: 0644]	patch \| blob