From 748ad60dff0b14f4aabe08f140ac74c73709cbec Mon Sep 17 00:00:00 2001
From: Levi Broderick <GrabYourPitchforks@users.noreply.github.com>
Date: Fri, 21 Jun 2019 20:38:05 -0700
Subject: [PATCH] Add UTF-8 support to TextEncoder (dotnet/corefx#38356)

Commit migrated from https://github.com/dotnet/corefx/commit/9b486c1ea7bcfe7d09206ae449bffb049754ade5
---
 .../src/Common/TextEncoderExtensions.cs            |  71 ++++++
 .../src/Properties/InternalsVisibleTo.cs           |  10 +
 .../src/System.Text.Encodings.Web.csproj           |  11 +
 .../src/System/Text/Encodings/Web/TextEncoder.cs   | 235 +++++++++++++++++++
 .../src/System/Text/Unicode/UnicodeHelpers.cs      | 254 +++++++++++++++++++++
 .../tests/CommonTestEncoder.cs                     | 105 ---------
 .../tests/ConfigurableScalarTextEncoder.cs         |  38 +++
 .../tests/System.Text.Encodings.Web.Tests.csproj   |  17 +-
 .../tests/TextEncoderTests.cs                      | 242 +++++++++++++++++++-
 9 files changed, 870 insertions(+), 113 deletions(-)
 create mode 100644 src/libraries/System.Text.Encodings.Web/src/Common/TextEncoderExtensions.cs
 create mode 100644 src/libraries/System.Text.Encodings.Web/src/Properties/InternalsVisibleTo.cs
 delete mode 100644 src/libraries/System.Text.Encodings.Web/tests/CommonTestEncoder.cs
 create mode 100644 src/libraries/System.Text.Encodings.Web/tests/ConfigurableScalarTextEncoder.cs
diff --git a/src/libraries/System.Text.Encodings.Web/src/Common/TextEncoderExtensions.cs b/src/libraries/System.Text.Encodings.Web/src/Common/TextEncoderExtensions.cs
new file mode 100644
index 0000000..6f7f7fe
--- /dev/null
+++ b/src/libraries/System.Text.Encodings.Web/src/Common/TextEncoderExtensions.cs
@@ -0,0 +1,71 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Buffers;
+using System.Diagnostics;
+using System.Reflection;
+using System.Runtime.CompilerServices;
+
+namespace System.Text.Encodings.Web
+{
+    /// <summary>
+    /// Provides access to <see cref="TextEncoder"/> APIs that aren't part of the ref asms.
+    /// </summary>
+    internal static class TextEncoderExtensions
+    {
+        private delegate OperationStatus EncodeUtf8Del(TextEncoder encoder, ReadOnlySpan<byte> utf8Source, Span<byte> utf8Destination, out int bytesConsumed, out int bytesWritten, bool isFinalBlock);
+        private delegate int FindFirstCharacterToEncodeUtf8Del(TextEncoder encoder, ReadOnlySpan<byte> utf8Text);
+
+        private static readonly EncodeUtf8Del s_encodeUtf8Fn = CreateEncodeUtf8Fn();
+        private static readonly FindFirstCharacterToEncodeUtf8Del s_findFirstCharToEncodeUtf8Fn = CreateFindFirstCharToEncodeUtf8Fn();
+
+        private static EncodeUtf8Del CreateEncodeUtf8Fn()
+        {
+            // Locate the shim method, which is able to perform fast virtual dispatch,
+            // then create a delegate to it.
+
+            MethodInfo methodInfo = typeof(TextEncoder).GetMethod("EncodeUtf8Shim", BindingFlags.NonPublic | BindingFlags.Static);
+            Debug.Assert(methodInfo != null);
+            EncodeUtf8Del del = (EncodeUtf8Del)methodInfo.CreateDelegate(typeof(EncodeUtf8Del));
+
+            // Now invoke the delegate once. The reason for this is that the delegate probably
+            // points to the pre-jit stub rather than the final codegen for the method, which
+            // means that invocations of this delegate will incur an unnecessary call back into
+            // the VM. Invoking the delegate forces JIT to take place now, so a future delegate
+            // will point directly to the codegen rather than the pre-jit stub.
+
+            del(HtmlEncoder.Default, ReadOnlySpan<byte>.Empty, Span<byte>.Empty, out _, out _, false);
+
+            // Now create the delegate again and return it to the caller.
+            // The delegate should now be pointing directly to the static method's codegen.
+
+            return (EncodeUtf8Del)methodInfo.CreateDelegate(typeof(EncodeUtf8Del));
+        }
+
+        private static FindFirstCharacterToEncodeUtf8Del CreateFindFirstCharToEncodeUtf8Fn()
+        {
+            // See the comments in CreateEncodeUtf8Fn for an overview of how this logic works.
+
+            MethodInfo methodInfo = typeof(TextEncoder).GetMethod("FindFirstCharacterToEncodeUtf8Shim", BindingFlags.NonPublic | BindingFlags.Static);
+            Debug.Assert(methodInfo != null);
+
+            FindFirstCharacterToEncodeUtf8Del del = (FindFirstCharacterToEncodeUtf8Del)methodInfo.CreateDelegate(typeof(FindFirstCharacterToEncodeUtf8Del));
+            del(HtmlEncoder.Default, ReadOnlySpan<byte>.Empty);
+
+            return (FindFirstCharacterToEncodeUtf8Del)methodInfo.CreateDelegate(typeof(FindFirstCharacterToEncodeUtf8Del));
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static OperationStatus EncodeUtf8(this TextEncoder encoder, ReadOnlySpan<byte> utf8Source, Span<byte> utf8Destination, out int bytesConsumed, out int bytesWritten, bool isFinalBlock = true)
+        {
+            return s_encodeUtf8Fn(encoder, utf8Source, utf8Destination, out bytesConsumed, out bytesWritten, isFinalBlock);
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static int FindFirstCharacterToEncodeUtf8(this TextEncoder encoder, ReadOnlySpan<byte> utf8Text)
+        {
+            return s_findFirstCharToEncodeUtf8Fn(encoder, utf8Text);
+        }
+    }
+}
diff --git a/src/libraries/System.Text.Encodings.Web/src/Properties/InternalsVisibleTo.cs b/src/libraries/System.Text.Encodings.Web/src/Properties/InternalsVisibleTo.cs
new file mode 100644
index 0000000..21107aa
--- /dev/null
+++ b/src/libraries/System.Text.Encodings.Web/src/Properties/InternalsVisibleTo.cs
@@ -0,0 +1,10 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Runtime.CompilerServices;
+
+// This attribute only exists to prevent the build system from tree shaking
+// internal members out of the final compiled binary.
+
+[assembly: InternalsVisibleTo("System.Text.Encodings.Web.Tests, PublicKey=00240000048000009400000006020000002400005253413100040000010001004b86c4cb78549b34bab61a3b1800e23bfeb5b3ec390074041536a7e3cbd97f5f04cf0f857155a8928eaa29ebfd11cfbbad3ba70efea7bda3226c6a8d370a4cd303f714486b6ebc225985a638471e6ef571cc92a4613c00b8fa65d61ccee0cbe5f36330c9a01f4183559f1bef24cc2917c6d913e3a541333a1d05d9bed22b38cb")]
diff --git a/src/libraries/System.Text.Encodings.Web/src/System.Text.Encodings.Web.csproj b/src/libraries/System.Text.Encodings.Web/src/System.Text.Encodings.Web.csproj
index d918978..e27938f 100644
--- a/src/libraries/System.Text.Encodings.Web/src/System.Text.Encodings.Web.csproj
+++ b/src/libraries/System.Text.Encodings.Web/src/System.Text.Encodings.Web.csproj
@@ -6,6 +6,9 @@
     <Configurations>netcoreapp-Debug;netcoreapp-Release;netstandard-Debug;netstandard-Release;uap-Windows_NT-Debug;uap-Windows_NT-Release</Configurations>
   </PropertyGroup>
   <ItemGroup>
+    <Compile Include="Properties\InternalsVisibleTo.cs" />
+  </ItemGroup>
+  <ItemGroup>
     <Compile Include="System\Text\Encodings\Web\HexUtil.cs" />
     <Compile Include="System\Text\Encodings\Web\HtmlEncoder.cs" />
     <Compile Include="System\Text\Encodings\Web\JavaScriptEncoder.cs" />
@@ -20,6 +23,14 @@
     <Compile Include="System\Text\Unicode\UnicodeRanges.generated.cs" />
   </ItemGroup>
   <ItemGroup>
+    <Compile Include="$(CommonPath)\CoreLib\System\Text\UnicodeDebug.cs">
+      <Link>System\Text\UnicodeDebug.cs</Link>
+    </Compile>
+    <Compile Include="$(CommonPath)\CoreLib\System\Text\UnicodeUtility.cs">
+      <Link>System\Text\UnicodeUtility.cs</Link>
+    </Compile>
+  </ItemGroup>
+  <ItemGroup>
     <Reference Include="System.Memory" />
   </ItemGroup>
   <ItemGroup Condition="'$(TargetsNetCoreApp)' == 'true' OR '$(TargetsUap)' == 'true'">
diff --git a/src/libraries/System.Text.Encodings.Web/src/System/Text/Encodings/Web/TextEncoder.cs b/src/libraries/System.Text.Encodings.Web/src/System/Text/Encodings/Web/TextEncoder.cs
index b98268f..b9eddfc 100644
--- a/src/libraries/System.Text.Encodings.Web/src/System/Text/Encodings/Web/TextEncoder.cs
+++ b/src/libraries/System.Text.Encodings.Web/src/System/Text/Encodings/Web/TextEncoder.cs
@@ -2,10 +2,12 @@
 // The .NET Foundation licenses this file to you under the MIT license.
 // See the LICENSE file in the project root for more information.
 
+using System.Buffers;
 using System.ComponentModel;
 using System.Diagnostics;
 using System.IO;
 using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
 using System.Text.Unicode;
 
 namespace System.Text.Encodings.Web
@@ -322,6 +324,190 @@ namespace System.Text.Encodings.Web
             }
         }
 
+        /// <summary>
+        /// Encodes the supplied UTF-8 text.
+        /// </summary>
+        /// <param name="utf8Source">A source buffer containing the UTF-8 text to encode.</param>
+        /// <param name="utf8Destination">The destination buffer to which the encoded form of <paramref name="utf8Source"/>
+        /// will be written.</param>
+        /// <param name="bytesConsumed">The number of bytes consumed from the <paramref name="utf8Source"/> buffer.</param>
+        /// <param name="bytesWritten">The number of bytes written to the <paramref name="utf8Destination"/> buffer.</param>
+        /// <param name="isFinalBlock"><see langword="true"/> if there is no further source data that needs to be encoded;
+        /// <see langword="false"/> if there is further source data that needs to be encoded.</param>
+        /// <returns>An <see cref="OperationStatus"/> describing the result of the encoding operation.</returns>
+        /// <remarks>The buffers <paramref name="utf8Source"/> and <paramref name="utf8Destination"/> must not overlap.</remarks>
+        internal unsafe virtual OperationStatus EncodeUtf8(ReadOnlySpan<byte> utf8Source, Span<byte> utf8Destination, out int bytesConsumed, out int bytesWritten, bool isFinalBlock = true)
+        {
+            // Optimization: Detect how much "doesn't require escaping" data exists at the beginning of the buffer,
+            // and memcpy it directly to the destination.
+
+            int numBytesToCopy = FindFirstCharacterToEncodeUtf8(utf8Source);
+            if (numBytesToCopy < 0)
+            {
+                numBytesToCopy = utf8Source.Length;
+            }
+
+            if (!utf8Source.Slice(0, numBytesToCopy).TryCopyTo(utf8Destination))
+            {
+                // There wasn't enough room in the destination to copy over the entire source buffer.
+                // We'll instead copy over as much as we can and return DestinationTooSmall. We do need to
+                // account for the fact that we don't want to truncate a multi-byte UTF-8 subsequence
+                // mid-sequence (since a subsequent slice and call to EncodeUtf8 would produce invalid
+                // data).
+
+                utf8Source = utf8Source.Slice(0, utf8Destination.Length + 1); // guaranteed not to fail since utf8Source is larger than utf8Destination
+                for (int i = utf8Source.Length - 1; i >= 0; i--)
+                {
+                    if (!UnicodeHelpers.IsUtf8ContinuationByte(in utf8Source[i]))
+                    {
+                        utf8Source.Slice(0, i).CopyTo(utf8Destination);
+                        bytesConsumed = i;
+                        bytesWritten = i;
+                        return OperationStatus.DestinationTooSmall;
+                    }
+                }
+
+                // If we got to this point, either somebody mutated the input buffer out from under us, or
+                // the FindFirstCharacterToEncodeUtf8 method was overridden incorrectly such that it attempted
+                // to skip over ill-formed data. In either case we don't know how to perform a partial memcpy
+                // so we shouldn't do anything at all. We'll return DestinationTooSmall here since the caller
+                // can resolve the issue by increasing the size of the destination buffer so that it's at least
+                // as large as the input buffer, which would skip over this entire code path.
+
+                bytesConsumed = 0;
+                bytesWritten = 0;
+                return OperationStatus.DestinationTooSmall;
+            }
+
+            // If we copied over all of the input data, success!
+
+            if (numBytesToCopy == utf8Source.Length)
+            {
+                bytesConsumed = numBytesToCopy;
+                bytesWritten = numBytesToCopy;
+                return OperationStatus.Done;
+            }
+
+            // There's data that must be encoded. Fall back to the scalar-by-scalar slow path.
+
+            int originalUtf8SourceLength = utf8Source.Length;
+            int originalUtf8DestinationLength = utf8Destination.Length;
+
+            utf8Source = utf8Source.Slice(numBytesToCopy);
+            utf8Destination = utf8Destination.Slice(numBytesToCopy);
+
+            const int TempUtf16CharBufferLength = 24; // arbitrarily chosen, but sufficient for any reasonable implementation
+            char* pTempCharBuffer = stackalloc char[TempUtf16CharBufferLength];
+
+            const int TempUtf8ByteBufferLength = TempUtf16CharBufferLength * 3 /* max UTF-8 output code units per UTF-16 input code unit */;
+            byte* pTempUtf8Buffer = stackalloc byte[TempUtf8ByteBufferLength];
+
+            while (!utf8Source.IsEmpty)
+            {
+                OperationStatus opStatus = UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Source, out uint nextScalarValue, out int bytesConsumedThisIteration);
+
+                switch (opStatus)
+                {
+                    case OperationStatus.Done:
+
+                        if (WillEncode((int)nextScalarValue))
+                        {
+                            goto default; // source data must be transcoded
+                        }
+                        else
+                        {
+                            // Source data can be copied as-is. Attempt to memcpy it to the destination buffer.
+
+                            if (utf8Source.Slice(0, bytesConsumedThisIteration).TryCopyTo(utf8Destination))
+                            {
+                                utf8Destination = utf8Destination.Slice(bytesConsumedThisIteration);
+                            }
+                            else
+                            {
+                                goto ReturnDestinationTooSmall;
+                            }
+                        }
+
+                        break;
+
+                    case OperationStatus.NeedMoreData:
+
+                        if (isFinalBlock)
+                        {
+                            goto default; // treat this as a normal invalid subsequence
+                        }
+                        else
+                        {
+                            goto ReturnNeedMoreData;
+                        }
+
+                    default:
+
+                        // This code path is hit for ill-formed input data (where decoding has replaced it with U+FFFD)
+                        // and for well-formed input data that must be escaped.
+
+                        if (TryEncodeUnicodeScalar((int)nextScalarValue, pTempCharBuffer, TempUtf16CharBufferLength, out int charsWrittenJustNow))
+                        {
+                            // Now that we have it as UTF-16, transcode it to UTF-8.
+                            // Need to copy it to a temporary buffer first, otherwise GetBytes might throw an exception
+                            // due to lack of output space.
+
+                            int transcodedByteCountThisIteration = Encoding.UTF8.GetBytes(pTempCharBuffer, charsWrittenJustNow, pTempUtf8Buffer, TempUtf8ByteBufferLength);
+                            ReadOnlySpan<byte> transcodedUtf8BytesThisIteration = new ReadOnlySpan<byte>(pTempUtf8Buffer, transcodedByteCountThisIteration);
+
+                            if (!transcodedUtf8BytesThisIteration.TryCopyTo(utf8Destination))
+                            {
+                                goto ReturnDestinationTooSmall;
+                            }
+
+                            utf8Destination = utf8Destination.Slice(transcodedByteCountThisIteration); // advance destination buffer
+                        }
+                        else
+                        {
+                            // We really don't expect this to fail. If that happens we'll report an error to our caller.
+
+                            goto ReturnInvalidData;
+                        }
+
+                        break;
+                }
+
+                utf8Source = utf8Source.Slice(bytesConsumedThisIteration);
+            }
+
+            // Input buffer has been fully processed!
+
+            bytesConsumed = originalUtf8SourceLength;
+            bytesWritten = originalUtf8DestinationLength - utf8Destination.Length;
+            return OperationStatus.Done;
+
+        ReturnDestinationTooSmall:
+
+            bytesConsumed = originalUtf8SourceLength - utf8Source.Length;
+            bytesWritten = originalUtf8DestinationLength - utf8Destination.Length;
+            return OperationStatus.DestinationTooSmall;
+
+        ReturnNeedMoreData:
+
+            bytesConsumed = originalUtf8SourceLength - utf8Source.Length;
+            bytesWritten = originalUtf8DestinationLength - utf8Destination.Length;
+            return OperationStatus.NeedMoreData;
+
+        ReturnInvalidData:
+
+            bytesConsumed = originalUtf8SourceLength - utf8Source.Length;
+            bytesWritten = originalUtf8DestinationLength - utf8Destination.Length;
+            return OperationStatus.InvalidData;
+        }
+
+        /// <summary>
+        /// Shim function which can call virtual method <see cref="EncodeUtf8"/> using fast dispatch.
+        /// </summary>
+        internal static OperationStatus EncodeUtf8Shim(TextEncoder encoder, ReadOnlySpan<byte> utf8Source, Span<byte> utf8Destination, out int bytesConsumed, out int bytesWritten, bool isFinalBlock)
+        {
+            return encoder.EncodeUtf8(utf8Source, utf8Destination, out bytesConsumed, out bytesWritten, isFinalBlock);
+        }
+
         private unsafe void EncodeCore(TextWriter output, char* value, int valueLength)
         {
             Debug.Assert(value != null & output != null);
@@ -383,6 +569,55 @@ namespace System.Text.Encodings.Web
             }
         }
 
+        private unsafe int FindFirstCharacterToEncode(ReadOnlySpan<char> text)
+        {
+            fixed (char* pText = &MemoryMarshal.GetReference(text))
+            {
+                return FindFirstCharacterToEncode(pText, text.Length);
+            }
+        }
+
+        /// <summary>
+        /// Given a UTF-8 text input buffer, finds the first element in the input buffer which would be
+        /// escaped by the current encoder instance.
+        /// </summary>
+        /// <param name="utf8Text">The UTF-8 text input buffer to search.</param>
+        /// <returns>
+        /// The index of the first element in <paramref name="utf8Text"/> which would be escaped by the
+        /// current encoder instance, or -1 if no data in <paramref name="utf8Text"/> requires escaping.
+        /// </returns>
+        [EditorBrowsable(EditorBrowsableState.Never)]
+        internal virtual int FindFirstCharacterToEncodeUtf8(ReadOnlySpan<byte> utf8Text)
+        {
+            int originalUtf8TextLength = utf8Text.Length;
+
+            // Loop through the input text, terminating when we see ill-formed UTF-8 or when we decode a scalar value
+            // that must be encoded. If we see either of these things then we'll return its index in the original
+            // input sequence. If we consume the entire text without seeing either of these, return -1 to indicate
+            // that the text can be copied as-is without escaping.
+
+            while (!utf8Text.IsEmpty)
+            {
+                if (UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Text, out uint nextScalarValue, out int bytesConsumedThisIteration) != OperationStatus.Done
+                   || WillEncode((int)nextScalarValue))
+                {
+                    return originalUtf8TextLength - utf8Text.Length;
+                }
+
+                utf8Text = utf8Text.Slice(bytesConsumedThisIteration);
+            }
+
+            return -1; // no input data needs to be escaped
+        }
+
+        /// <summary>
+        /// Shim function which can call virtual method <see cref="FindFirstCharacterToEncodeUtf8"/> using fast dispatch.
+        /// </summary>
+        internal static int FindFirstCharacterToEncodeUtf8Shim(TextEncoder encoder, ReadOnlySpan<byte> utf8Text)
+        {
+            return encoder.FindFirstCharacterToEncodeUtf8(utf8Text);
+        }
+
         internal static unsafe bool TryCopyCharacters(char[] source, char* destination, int destinationLength, out int numberOfCharactersWritten)
         {
             Debug.Assert(source != null && destination != null && destinationLength >= 0);
diff --git a/src/libraries/System.Text.Encodings.Web/src/System/Text/Unicode/UnicodeHelpers.cs b/src/libraries/System.Text.Encodings.Web/src/System/Text/Unicode/UnicodeHelpers.cs
index aa66f92..55162cc 100644
--- a/src/libraries/System.Text.Encodings.Web/src/System/Text/Unicode/UnicodeHelpers.cs
+++ b/src/libraries/System.Text.Encodings.Web/src/System/Text/Unicode/UnicodeHelpers.cs
@@ -2,6 +2,7 @@
 // The .NET Foundation licenses this file to you under the MIT license.
 // See the LICENSE file in the project root for more information.
 
+using System.Buffers;
 using System.Buffers.Binary;
 using System.Diagnostics;
 using System.Runtime.CompilerServices;
@@ -47,6 +48,240 @@ namespace System.Text.Unicode
         }
 
         /// <summary>
+        /// A copy of the logic in Rune.DecodeFromUtf16.
+        /// </summary>
+        public static OperationStatus DecodeScalarValueFromUtf16(ReadOnlySpan<char> source, out uint result, out int charsConsumed)
+        {
+            const char ReplacementChar = '\uFFFD';
+
+            if (!source.IsEmpty)
+            {
+                // First, check for the common case of a BMP scalar value.
+                // If this is correct, return immediately.
+
+                uint firstChar = source[0];
+                if (!UnicodeUtility.IsSurrogateCodePoint(firstChar))
+                {
+                    result = firstChar;
+                    charsConsumed = 1;
+                    return OperationStatus.Done;
+                }
+
+                // First thing we saw was a UTF-16 surrogate code point.
+                // Let's optimistically assume for now it's a high surrogate and hope
+                // that combining it with the next char yields useful results.
+
+                if (1 < (uint)source.Length)
+                {
+                    uint secondChar = source[1];
+                    if (UnicodeUtility.IsHighSurrogateCodePoint(firstChar) && UnicodeUtility.IsLowSurrogateCodePoint(secondChar))
+                    {
+                        // Success! Formed a supplementary scalar value.
+                        result = UnicodeUtility.GetScalarFromUtf16SurrogatePair(firstChar, secondChar);
+                        charsConsumed = 2;
+                        return OperationStatus.Done;
+                    }
+                    else
+                    {
+                        // Either the first character was a low surrogate, or the second
+                        // character was not a low surrogate. This is an error.
+                        goto InvalidData;
+                    }
+                }
+                else if (!UnicodeUtility.IsHighSurrogateCodePoint(firstChar))
+                {
+                    // Quick check to make sure we're not going to report NeedMoreData for
+                    // a single-element buffer where the data is a standalone low surrogate
+                    // character. Since no additional data will ever make this valid, we'll
+                    // report an error immediately.
+                    goto InvalidData;
+                }
+            }
+
+            // If we got to this point, the input buffer was empty, or the buffer
+            // was a single element in length and that element was a high surrogate char.
+
+            charsConsumed = source.Length;
+            result = ReplacementChar;
+            return OperationStatus.NeedMoreData;
+
+        InvalidData:
+
+            charsConsumed = 1; // maximal invalid subsequence for UTF-16 is always a single code unit in length
+            result = ReplacementChar;
+            return OperationStatus.InvalidData;
+        }
+
+        /// <summary>
+        /// A copy of the logic in Rune.DecodeFromUtf8.
+        /// </summary>
+        public static OperationStatus DecodeScalarValueFromUtf8(ReadOnlySpan<byte> source, out uint result, out int bytesConsumed)
+        {
+            const char ReplacementChar = '\uFFFD';
+
+            // This method follows the Unicode Standard's recommendation for detecting
+            // the maximal subpart of an ill-formed subsequence. See The Unicode Standard,
+            // Ch. 3.9 for more details. In summary, when reporting an invalid subsequence,
+            // it tries to consume as many code units as possible as long as those code
+            // units constitute the beginning of a longer well-formed subsequence per Table 3-7.
+
+            int index = 0;
+
+            // Try reading input[0].
+
+            if ((uint)index >= (uint)source.Length)
+            {
+                goto NeedsMoreData;
+            }
+
+            uint tempValue = source[index];
+            if (!UnicodeUtility.IsAsciiCodePoint(tempValue))
+            {
+                goto NotAscii;
+            }
+
+        Finish:
+
+            bytesConsumed = index + 1;
+            Debug.Assert(1 <= bytesConsumed && bytesConsumed <= 4); // Valid subsequences are always length [1..4]
+            result = tempValue;
+            return OperationStatus.Done;
+
+        NotAscii:
+
+            // Per Table 3-7, the beginning of a multibyte sequence must be a code unit in
+            // the range [C2..F4]. If it's outside of that range, it's either a standalone
+            // continuation byte, or it's an overlong two-byte sequence, or it's an out-of-range
+            // four-byte sequence.
+
+            if (!UnicodeUtility.IsInRangeInclusive(tempValue, 0xC2, 0xF4))
+            {
+                goto FirstByteInvalid;
+            }
+
+            tempValue = (tempValue - 0xC2) << 6;
+
+            // Try reading input[1].
+
+            index++;
+            if ((uint)index >= (uint)source.Length)
+            {
+                goto NeedsMoreData;
+            }
+
+            // Continuation bytes are of the form [10xxxxxx], which means that their two's
+            // complement representation is in the range [-65..-128]. This allows us to
+            // perform a single comparison to see if a byte is a continuation byte.
+
+            int thisByteSignExtended = (sbyte)source[index];
+            if (thisByteSignExtended >= -64)
+            {
+                goto Invalid;
+            }
+
+            tempValue += (uint)thisByteSignExtended;
+            tempValue += 0x80; // remove the continuation byte marker
+            tempValue += (0xC2 - 0xC0) << 6; // remove the leading byte marker
+
+            if (tempValue < 0x0800)
+            {
+                Debug.Assert(UnicodeUtility.IsInRangeInclusive(tempValue, 0x0080, 0x07FF));
+                goto Finish; // this is a valid 2-byte sequence
+            }
+
+            // This appears to be a 3- or 4-byte sequence. Since per Table 3-7 we now have
+            // enough information (from just two code units) to detect overlong or surrogate
+            // sequences, we need to perform these checks now.
+
+            if (!UnicodeUtility.IsInRangeInclusive(tempValue, ((0xE0 - 0xC0) << 6) + (0xA0 - 0x80), ((0xF4 - 0xC0) << 6) + (0x8F - 0x80)))
+            {
+                // The first two bytes were not in the range [[E0 A0]..[F4 8F]].
+                // This is an overlong 3-byte sequence or an out-of-range 4-byte sequence.
+                goto Invalid;
+            }
+
+            if (UnicodeUtility.IsInRangeInclusive(tempValue, ((0xED - 0xC0) << 6) + (0xA0 - 0x80), ((0xED - 0xC0) << 6) + (0xBF - 0x80)))
+            {
+                // This is a UTF-16 surrogate code point, which is invalid in UTF-8.
+                goto Invalid;
+            }
+
+            if (UnicodeUtility.IsInRangeInclusive(tempValue, ((0xF0 - 0xC0) << 6) + (0x80 - 0x80), ((0xF0 - 0xC0) << 6) + (0x8F - 0x80)))
+            {
+                // This is an overlong 4-byte sequence.
+                goto Invalid;
+            }
+
+            // The first two bytes were just fine. We don't need to perform any other checks
+            // on the remaining bytes other than to see that they're valid continuation bytes.
+
+            // Try reading input[2].
+
+            index++;
+            if ((uint)index >= (uint)source.Length)
+            {
+                goto NeedsMoreData;
+            }
+
+            thisByteSignExtended = (sbyte)source[index];
+            if (thisByteSignExtended >= -64)
+            {
+                goto Invalid; // this byte is not a UTF-8 continuation byte
+            }
+
+            tempValue <<= 6;
+            tempValue += (uint)thisByteSignExtended;
+            tempValue += 0x80; // remove the continuation byte marker
+            tempValue -= (0xE0 - 0xC0) << 12; // remove the leading byte marker
+
+            if (tempValue <= 0xFFFF)
+            {
+                Debug.Assert(UnicodeUtility.IsInRangeInclusive(tempValue, 0x0800, 0xFFFF));
+                goto Finish; // this is a valid 3-byte sequence
+            }
+
+            // Try reading input[3].
+
+            index++;
+            if ((uint)index >= (uint)source.Length)
+            {
+                goto NeedsMoreData;
+            }
+
+            thisByteSignExtended = (sbyte)source[index];
+            if (thisByteSignExtended >= -64)
+            {
+                goto Invalid; // this byte is not a UTF-8 continuation byte
+            }
+
+            tempValue <<= 6;
+            tempValue += (uint)thisByteSignExtended;
+            tempValue += 0x80; // remove the continuation byte marker
+            tempValue -= (0xF0 - 0xE0) << 18; // remove the leading byte marker
+
+            UnicodeDebug.AssertIsValidSupplementaryPlaneScalar(tempValue);
+            goto Finish; // this is a valid 4-byte sequence
+
+        FirstByteInvalid:
+
+            index = 1; // Invalid subsequences are always at least length 1.
+
+        Invalid:
+
+            Debug.Assert(1 <= index && index <= 3); // Invalid subsequences are always length 1..3
+            bytesConsumed = index;
+            result = ReplacementChar;
+            return OperationStatus.InvalidData;
+
+        NeedsMoreData:
+
+            Debug.Assert(0 <= index && index <= 3); // Incomplete subsequences are always length 0..3
+            bytesConsumed = index;
+            result = ReplacementChar;
+            return OperationStatus.NeedMoreData;
+        }
+
+        /// <summary>
         /// Returns a bitmap of all characters which are defined per the checked-in version
         /// of the Unicode specification.
         /// </summary>
@@ -273,5 +508,24 @@ namespace System.Text.Unicode
         {
             return ((scalar & ~((int)char.MaxValue)) != 0);
         }
+
+        /// <summary>
+        /// Returns <see langword="true"/> iff <paramref name="value"/> is a UTF-8 continuation byte;
+        /// i.e., has binary representation 10xxxxxx, where x is any bit.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static bool IsUtf8ContinuationByte(in byte value)
+        {
+            // This API takes its input as a readonly ref so that the JIT can emit "cmp ModRM" instructions
+            // directly rather than bounce a temporary through a register. That is, we want the JIT to be
+            // able to emit a single "cmp byte ptr [data], C0h" instruction if we're querying a memory location
+            // to see if it's a continuation byte. Data that's already enregistered will go through the
+            // normal "cmp reg, C0h" code paths, perhaps with some extra unnecessary "movzx" instructions.
+            //
+            // The below check takes advantage of the two's complement representation of negative numbers.
+            // [ 0b1000_0000, 0b1011_1111 ] reinterpreted as sbyte is [ -128 (sbyte.MinValue), -65 ]
+
+            return ((sbyte)value < -64); // equivalent to (sbyte)value <= -65
+        }
     }
 }
diff --git a/src/libraries/System.Text.Encodings.Web/tests/CommonTestEncoder.cs b/src/libraries/System.Text.Encodings.Web/tests/CommonTestEncoder.cs
deleted file mode 100644
index 7b1ca90..0000000
--- a/src/libraries/System.Text.Encodings.Web/tests/CommonTestEncoder.cs
+++ /dev/null
@@ -1,105 +0,0 @@
-// Licensed to the .NET Foundation under one or more agreements.
-// The .NET Foundation licenses this file to you under the MIT license.
-// See the LICENSE file in the project root for more information.
-
-using System;
-using System.Globalization;
-using System.IO;
-using System.Runtime.CompilerServices;
-using System.Text.Encodings.Web;
-
-namespace Microsoft.Framework.WebEncoders
-{
-    /// <summary>
-    /// Dummy encoder used for unit testing.
-    /// </summary>
-    public sealed class CommonTestEncoder : IHtmlEncoder, IJavaScriptStringEncoder, IUrlEncoder
-    {
-        /// <summary>
-        /// Returns "HtmlEncode[[value]]".
-        /// </summary>
-        public string HtmlEncode(string value)
-        {
-            return EncodeCore(value);
-        }
-
-        /// <summary>
-        /// Writes "HtmlEncode[[value]]".
-        /// </summary>
-        public void HtmlEncode(string value, int startIndex, int characterCount, TextWriter output)
-        {
-            EncodeCore(value, startIndex, characterCount, output);
-        }
-
-        /// <summary>
-        /// Writes "HtmlEncode[[value]]".
-        /// </summary>
-        public void HtmlEncode(char[] value, int startIndex, int characterCount, TextWriter output)
-        {
-            EncodeCore(value, startIndex, characterCount, output);
-        }
-
-        /// <summary>
-        /// Returns "JavaScriptStringEncode[[value]]".
-        /// </summary>
-        public string JavaScriptStringEncode(string value)
-        {
-            return EncodeCore(value);
-        }
-
-        /// <summary>
-        /// Writes "JavaScriptStringEncode[[value]]".
-        /// </summary>
-        public void JavaScriptStringEncode(string value, int startIndex, int characterCount, TextWriter output)
-        {
-            EncodeCore(value, startIndex, characterCount, output);
-        }
-
-        /// <summary>
-        /// Writes "JavaScriptStringEncode[[value]]".
-        /// </summary>
-        public void JavaScriptStringEncode(char[] value, int startIndex, int characterCount, TextWriter output)
-        {
-            EncodeCore(value, startIndex, characterCount, output);
-        }
-
-        /// <summary>
-        /// Returns "UrlEncode[[value]]".
-        /// </summary>
-        public string UrlEncode(string value)
-        {
-            return EncodeCore(value);
-        }
-
-        /// <summary>
-        /// Writes "UrlEncode[[value]]".
-        /// </summary>
-        public void UrlEncode(string value, int startIndex, int characterCount, TextWriter output)
-        {
-            EncodeCore(value, startIndex, characterCount, output);
-        }
-
-        /// <summary>
-        /// Writes "UrlEncode[[value]]".
-        /// </summary>
-        public void UrlEncode(char[] value, int startIndex, int characterCount, TextWriter output)
-        {
-            EncodeCore(value, startIndex, characterCount, output);
-        }
-
-        private static string EncodeCore(string value, [CallerMemberName] string encodeType = null)
-        {
-            return string.Format(CultureInfo.InvariantCulture, "{0}[[{1}]]", encodeType, value);
-        }
-
-        private static void EncodeCore(string value, int startIndex, int characterCount, TextWriter output, [CallerMemberName] string encodeType = null)
-        {
-            output.Write(EncodeCore(value.Substring(startIndex, characterCount), encodeType));
-        }
-
-        private static void EncodeCore(char[] value, int startIndex, int characterCount, TextWriter output, [CallerMemberName] string encodeType = null)
-        {
-            output.Write(EncodeCore(new string(value, startIndex, characterCount), encodeType));
-        }
-    }
-}
diff --git a/src/libraries/System.Text.Encodings.Web/tests/ConfigurableScalarTextEncoder.cs b/src/libraries/System.Text.Encodings.Web/tests/ConfigurableScalarTextEncoder.cs
new file mode 100644
index 0000000..718ffcd
--- /dev/null
+++ b/src/libraries/System.Text.Encodings.Web/tests/ConfigurableScalarTextEncoder.cs
@@ -0,0 +1,38 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using System.Text.Encodings.Web;
+
+namespace Microsoft.Framework.WebEncoders
+{
+    /// <summary>
+    /// Dummy encoder used for unit testing. A caller-supplied predicate decides which scalar values are left unescaped.
+    /// </summary>
+    public sealed class ConfigurableScalarTextEncoder : TextEncoder
+    {
+        private readonly Predicate<int> _isScalarAllowed; // returns true for scalars that do not need encoding
+
+        public ConfigurableScalarTextEncoder(Predicate<int> isScalarAllowed)
+        {
+            _isScalarAllowed = isScalarAllowed;
+        }
+
+        public override int MaxOutputCharactersPerInputCharacter => throw new NotImplementedException(); // deliberately unimplemented in this test double
+
+        public override unsafe int FindFirstCharacterToEncode(char* text, int textLength) => throw new NotImplementedException(); // deliberately unimplemented in this test double
+
+        public override bool WillEncode(int unicodeScalar) => !_isScalarAllowed(unicodeScalar); // encode exactly what the predicate rejects
+
+        /// <summary>
+        /// Encodes scalar as an unsigned hexadecimal number (min. 4 hex digits) surrounded by square brackets: "[XXXX]". Reports 0 characters written and returns false if the buffer is too small.
+        /// </summary>
+        public override unsafe bool TryEncodeUnicodeScalar(int unicodeScalar, char* buffer, int bufferLength, out int numberOfCharactersWritten)
+        {
+            string encoded = FormattableString.Invariant($"[{(uint)unicodeScalar:X4}]");
+            numberOfCharactersWritten = (encoded.Length <= (uint)bufferLength) ? encoded.Length : 0; // 0 when the encoded form does not fit
+            return encoded.AsSpan().TryCopyTo(new Span<char>(buffer, bufferLength)); // false (nothing copied) when the buffer is too short
+        }
+    }
+}
diff --git a/src/libraries/System.Text.Encodings.Web/tests/System.Text.Encodings.Web.Tests.csproj b/src/libraries/System.Text.Encodings.Web/tests/System.Text.Encodings.Web.Tests.csproj
index 565079d..a2e9feb 100644
--- a/src/libraries/System.Text.Encodings.Web/tests/System.Text.Encodings.Web.Tests.csproj
+++ b/src/libraries/System.Text.Encodings.Web/tests/System.Text.Encodings.Web.Tests.csproj
@@ -14,14 +14,15 @@
     </CodeAnalysisDependentAssemblyPaths>
   </ItemGroup>
   <ItemGroup>
-    <Compile Include="..\src\System\Text\Encodings\Web\HexUtil.cs" Link="HexUtil.cs" />
-    <Compile Include="..\src\System\Text\Internal\AllowedCharactersBitmap.cs" Link="AllowedCharactersBitmap.cs" />
-    <Compile Include="..\src\System\Text\Unicode\UnicodeHelpers.cs" Link="UnicodeHelpers.cs" />
-    <Compile Include="..\src\System\Text\Unicode\UnicodeHelpers.generated.cs" Link="UnicodeHelpers.generated.cs" />
+    <Compile Include="..\src\Common\TextEncoderExtensions.cs" />
+    <Compile Include="..\src\System\Text\Encodings\Web\HexUtil.cs" />
+    <Compile Include="..\src\System\Text\Internal\AllowedCharactersBitmap.cs" />
+    <Compile Include="..\src\System\Text\Unicode\UnicodeHelpers.cs" />
+    <Compile Include="..\src\System\Text\Unicode\UnicodeHelpers.generated.cs" />
     <Compile Include="AllowedCharsBitmapTests.cs" />
     <Compile Include="TextEncoderTests.cs" />
+    <Compile Include="ConfigurableScalarTextEncoder.cs" />
     <Compile Include="ScalarTestEncoder.cs" />
-    <Compile Include="CommonTestEncoder.cs" />
     <Compile Include="EncoderCommon.cs" />
     <Compile Include="EncoderCommonTests.cs" />
     <Compile Include="EncoderExtensionsTests.cs" />
@@ -45,6 +46,12 @@
     <Compile Include="UrlEncoderTests.cs" />
   </ItemGroup>
   <ItemGroup>
+    <Compile Include="$(CommonPath)\CoreLib\System\Text\UnicodeDebug.cs">
+      <Link>System\Text\UnicodeDebug.cs</Link>
+    </Compile>
+    <Compile Include="$(CommonPath)\CoreLib\System\Text\UnicodeUtility.cs">
+      <Link>System\Text\UnicodeUtility.cs</Link>
+    </Compile>
     <EmbeddedResource Include="$(CommonTestPath)\Data\UnicodeData.12.1.txt">
       <LogicalName>UnicodeData.12.1.txt</LogicalName>
     </EmbeddedResource>
diff --git a/src/libraries/System.Text.Encodings.Web/tests/TextEncoderTests.cs b/src/libraries/System.Text.Encodings.Web/tests/TextEncoderTests.cs
index ec4725e..c3345e2 100644
--- a/src/libraries/System.Text.Encodings.Web/tests/TextEncoderTests.cs
+++ b/src/libraries/System.Text.Encodings.Web/tests/TextEncoderTests.cs
@@ -3,10 +3,11 @@
 // See the LICENSE file in the project root for more information.
 
 using System;
-using System.Globalization;
-using System.IO;
+using System.Buffers;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
 using System.Text.Encodings.Web;
-using System.Text.Unicode;
 using Xunit;
 
 namespace Microsoft.Framework.WebEncoders
@@ -41,5 +42,240 @@ namespace Microsoft.Framework.WebEncoders
             Assert.Equal(eX + ePair + eX, encoder.Encode(X + Pair + X)); // two iterations, block, even length
             Assert.Equal(ePair + eX + ePair, encoder.Encode(Pair + X + Pair)); // three iterations, no block, odd length
         }
+
+        [Theory]
+        [InlineData(0, 0)]
+        [InlineData(1, 0)] // too small for the leading 2-byte sequence
+        [InlineData(2, 2)]
+        [InlineData(3, 3)]
+        [InlineData(4, 3)] // too small for the 3-byte sequence that starts at offset 3
+        [InlineData(5, 3)]
+        [InlineData(6, 6)]
+        [InlineData(7, 6)] // too small for the 4-byte sequence that starts at offset 6
+        [InlineData(8, 6)]
+        [InlineData(9, 6)]
+        [InlineData(10, 10)]
+        [InlineData(11, 11)] // exactly fits the whole input
+        [InlineData(12, 11)] // larger than the input; only the 11 input bytes are copied
+        public void EncodeUtf8_WellFormedInput_DoesNotRequireEncoding_CopiedToDestinationCorrectly(int destinationSize, int expectedBytesCopied)
+        {
+            // This test considers input which is well-formed and doesn't need to be encoded.
+            // If the destination buffer is large enough, the data should be copied in its entirety.
+            // If the destination buffer is too small, only complete UTF-8 subsequences should be copied.
+            // We should never copy a partial subsequence, as it would cause a future call to EncodeUtf8
+            // to misinterpret the data as ill-formed.
+
+            // Arrange
+
+            byte[] fullUtf8Input = new byte[] {
+                0xC2, 0x82,
+                0x40,
+                0xE2, 0x90, 0x91,
+                0xF3, 0xA0, 0xA1, 0xA2,
+                0x50 }; // UTF-8 subsequences of length 2, 1, 3, 4, 1 (subsequences end at offsets 2, 3, 6, 10, 11)
+
+            var encoder = new ConfigurableScalarTextEncoder(_ => true /* allow everything */);
+
+            // Act & assert (run once with isFinalBlock: true and once with isFinalBlock: false; expectations are identical)
+
+            OperationStatus expectedOpStatus = (expectedBytesCopied == fullUtf8Input.Length) ? OperationStatus.Done : OperationStatus.DestinationTooSmall;
+
+            byte[] destination = new byte[destinationSize];
+            Assert.Equal(expectedOpStatus, encoder.EncodeUtf8(fullUtf8Input, destination, out int bytesConsumed, out int bytesWritten, isFinalBlock: true));
+            Assert.Equal(expectedBytesCopied, bytesConsumed);
+            Assert.Equal(expectedBytesCopied, bytesWritten); // bytes written should match bytes consumed if no encoding needs to take place
+            Assert.Equal(fullUtf8Input.AsSpan(0, bytesConsumed).ToArray(), destination.AsSpan(0, bytesWritten).ToArray()); // ensure byte-for-byte copy
+            Assert.True(destination.AsSpan(bytesWritten).ToArray().All(el => el == 0)); // all remaining bytes should be unchanged (still zero from allocation)
+
+            destination = new byte[destinationSize];
+            Assert.Equal(expectedOpStatus, encoder.EncodeUtf8(fullUtf8Input, destination, out bytesConsumed, out bytesWritten, isFinalBlock: false));
+            Assert.Equal(expectedBytesCopied, bytesConsumed);
+            Assert.Equal(expectedBytesCopied, bytesWritten); // bytes written should match bytes consumed if no encoding needs to take place
+            Assert.Equal(fullUtf8Input.AsSpan(0, bytesConsumed).ToArray(), destination.AsSpan(0, bytesWritten).ToArray()); // ensure byte-for-byte copy
+            Assert.True(destination.AsSpan(bytesWritten).ToArray().All(el => el == 0)); // all remaining bytes should be unchanged (still zero from allocation)
+        }
+
+        [Fact]
+        public void EncodeUtf8_MixedInputWhichRequiresEncodingOrReplacement()
+        {
+            // Arrange
+
+            var fullInput = new[]
+            {
+                new { utf8Bytes = new byte[] { 0x40 }, output = "@" },
+                new { utf8Bytes = new byte[] { 0xC3, 0x85 }, output = "[00C5]" }, // U+00C5 LATIN CAPITAL LETTER A WITH RING ABOVE (encoded since odd scalar value)
+                new { utf8Bytes = new byte[] { 0xC3, 0x86 }, output = "\u00C6" }, // U+00C6 LATIN CAPITAL LETTER AE (on allow list)
+                new { utf8Bytes = new byte[] { 0xFF }, output = "[FFFD]" }, // (invalid UTF-8, replaced with encoded form of U+FFFD)
+                new { utf8Bytes = new byte[] { 0xEF, 0xBF, 0xBD }, output = "[FFFD]" }, // U+FFFD REPLACEMENT CHARACTER (encoded since not on allow list)
+                new { utf8Bytes = new byte[] { 0xF0, 0x90, 0x82, 0x82 }, output = "\U00010082" }, // U+10082 LINEAR B IDEOGRAM B104 DEER (not encoded since on allow list)
+                new { utf8Bytes = new byte[] { 0xF0, 0x90, 0x82, 0x83 }, output = "[10083]" }, // U+10083 LINEAR B IDEOGRAM B105 EQUID (encoded since not on allow list)
+            };
+
+            var encoder = new ConfigurableScalarTextEncoder(scalarValue => (scalarValue % 2) == 0 /* allow only even-valued scalars to be represented unescaped */);
+
+            // Act & assert
+
+            List<byte> aggregateInputBytesSoFar = new List<byte>(); // input grows one byte at a time across all entries
+            List<byte> expectedOutputBytesSoFar = new List<byte>(); // expected output for all fully-consumed entries
+
+            foreach (var entry in fullInput)
+            {
+                int aggregateInputByteCountAtStartOfLoop = aggregateInputBytesSoFar.Count; // input bytes belonging to earlier, complete entries
+
+                byte[] destination;
+                int bytesConsumed, bytesWritten;
+
+                for (int i = 0; i < entry.utf8Bytes.Length - 1; i++) // feed every byte of this entry except the last
+                {
+                    aggregateInputBytesSoFar.Add(entry.utf8Bytes[i]);
+
+                    // If not final block, partial encoding should say "needs more data".
+                    // We'll try with various destination lengths just to make sure it doesn't affect result.
+
+                    foreach (int destinationLength in new[] { expectedOutputBytesSoFar.Count, expectedOutputBytesSoFar.Count + 1024 })
+                    {
+                        destination = new byte[destinationLength];
+
+                        Assert.Equal(OperationStatus.NeedMoreData, encoder.EncodeUtf8(aggregateInputBytesSoFar.ToArray(), destination, out bytesConsumed, out bytesWritten, isFinalBlock: false));
+                        Assert.Equal(aggregateInputByteCountAtStartOfLoop, bytesConsumed);
+                        Assert.Equal(expectedOutputBytesSoFar.Count, bytesWritten);
+                        Assert.Equal(expectedOutputBytesSoFar.ToArray(), new Span<byte>(destination, 0, expectedOutputBytesSoFar.Count).ToArray());
+                    }
+
+                    // Now try it with "isFinalBlock = true" to force the U+FFFD conversion of the truncated sequence
+
+                    destination = new byte[expectedOutputBytesSoFar.Count]; // first with not enough output space to write "[FFFD]"
+
+                    Assert.Equal(OperationStatus.DestinationTooSmall, encoder.EncodeUtf8(aggregateInputBytesSoFar.ToArray(), destination, out bytesConsumed, out bytesWritten, isFinalBlock: true));
+                    Assert.Equal(aggregateInputByteCountAtStartOfLoop, bytesConsumed);
+                    Assert.Equal(expectedOutputBytesSoFar.Count, bytesWritten);
+                    Assert.Equal(expectedOutputBytesSoFar.ToArray(), new Span<byte>(destination, 0, expectedOutputBytesSoFar.Count).ToArray());
+
+                    destination = new byte[expectedOutputBytesSoFar.Count + 1024]; // then with enough output space to write "[FFFD]"
+
+                    Assert.Equal(OperationStatus.Done, encoder.EncodeUtf8(aggregateInputBytesSoFar.ToArray(), destination, out bytesConsumed, out bytesWritten, isFinalBlock: true));
+                    Assert.Equal(aggregateInputBytesSoFar.Count, bytesConsumed);
+                    Assert.Equal(expectedOutputBytesSoFar.Count + "[FFFD]".Length, bytesWritten);
+                    Assert.Equal(expectedOutputBytesSoFar.Concat(Encoding.UTF8.GetBytes("[FFFD]")).ToArray(), new Span<byte>(destination, 0, expectedOutputBytesSoFar.Count + "[FFFD]".Length).ToArray());
+                }
+
+                // Consume the remainder of this entry and make sure it escaped properly (if needed).
+
+                aggregateInputBytesSoFar.Add(entry.utf8Bytes.Last());
+
+                // First with not enough space in the destination buffer (one byte short of this entry's output).
+
+                destination = new byte[expectedOutputBytesSoFar.Count + Encoding.UTF8.GetByteCount(entry.output) - 1];
+
+                Assert.Equal(OperationStatus.DestinationTooSmall, encoder.EncodeUtf8(aggregateInputBytesSoFar.ToArray(), destination, out bytesConsumed, out bytesWritten, isFinalBlock: true));
+                Assert.Equal(aggregateInputByteCountAtStartOfLoop, bytesConsumed);
+                Assert.Equal(expectedOutputBytesSoFar.Count, bytesWritten);
+                Assert.Equal(expectedOutputBytesSoFar.ToArray(), new Span<byte>(destination, 0, expectedOutputBytesSoFar.Count).ToArray());
+
+                // Then with exactly enough space in the destination buffer,
+                // and again with more than enough space in the destination buffer.
+
+                expectedOutputBytesSoFar.AddRange(Encoding.UTF8.GetBytes(entry.output));
+
+                foreach (int destinationLength in new[] { expectedOutputBytesSoFar.Count, expectedOutputBytesSoFar.Count + 1024 })
+                {
+                    destination = new byte[destinationLength];
+
+                        Assert.Equal(OperationStatus.Done, encoder.EncodeUtf8(aggregateInputBytesSoFar.ToArray(), destination, out bytesConsumed, out bytesWritten, isFinalBlock: false));
+                    Assert.Equal(aggregateInputBytesSoFar.Count, bytesConsumed);
+                    Assert.Equal(expectedOutputBytesSoFar.Count, bytesWritten);
+                    Assert.Equal(expectedOutputBytesSoFar.ToArray(), new Span<byte>(destination, 0, expectedOutputBytesSoFar.Count).ToArray());
+                }
+            }
+        }
+
+        [Fact]
+        public void EncodeUtf8_EmptyInput_AlwaysSucceeds()
+        {
+            // Arrange
+
+            var encoder = new ConfigurableScalarTextEncoder(_ => false /* disallow everything */);
+
+            // Act & assert (empty source and empty destination, once with isFinalBlock: true and once with false)
+
+            Assert.Equal(OperationStatus.Done, encoder.EncodeUtf8(ReadOnlySpan<byte>.Empty, Span<byte>.Empty, out int bytesConsumed, out int bytesWritten, isFinalBlock: true));
+            Assert.Equal(0, bytesConsumed);
+            Assert.Equal(0, bytesWritten);
+
+            Assert.Equal(OperationStatus.Done, encoder.EncodeUtf8(ReadOnlySpan<byte>.Empty, Span<byte>.Empty, out bytesConsumed, out bytesWritten, isFinalBlock: false));
+            Assert.Equal(0, bytesConsumed);
+            Assert.Equal(0, bytesWritten);
+        }
+
+        [Fact]
+        public void FindFirstCharToEncodeUtf8_EmptyInput_ReturnsNegOne()
+        {
+            // Arrange
+
+            var encoder = new ConfigurableScalarTextEncoder(_ => false /* disallow everything */);
+
+            // Act
+
+            int idxOfFirstByteToEncode = encoder.FindFirstCharacterToEncodeUtf8(ReadOnlySpan<byte>.Empty);
+
+            // Assert (expect -1 for empty input even though the encoder disallows every scalar)
+
+            Assert.Equal(-1, idxOfFirstByteToEncode);
+        }
+
+        [Fact]
+        public void FindFirstCharToEncodeUtf8_WellFormedData_AllCharsAllowed()
+        {
+            // Arrange
+
+            byte[] inputBytes = Encoding.UTF8.GetBytes("\U00000040\U00000400\U00004000\U00040000"); // scalars whose UTF-8 encodings are 1, 2, 3, and 4 bytes long
+            var encoder = new ConfigurableScalarTextEncoder(_ => true /* allow everything */);
+
+            // Act
+
+            int idxOfFirstByteToEncode = encoder.FindFirstCharacterToEncodeUtf8(inputBytes);
+
+            // Assert (expect -1 since every scalar is allowed and the input is well-formed)
+
+            Assert.Equal(-1, idxOfFirstByteToEncode);
+        }
+
+        [Fact]
+        public void FindFirstCharToEncodeUtf8_WellFormedData_SomeCharsDisallowed()
+        {
+            // Arrange
+
+            byte[] inputBytes = Encoding.UTF8.GetBytes("\U00000040\U00000400\U00004000\U00040000"); // scalars whose UTF-8 encodings are 1, 2, 3, and 4 bytes long
+            var encoder = new ConfigurableScalarTextEncoder(codePoint => codePoint != 0x4000 /* disallow U+4000, allow all else */);
+
+            // Act
+
+            int idxOfFirstByteToEncode = encoder.FindFirstCharacterToEncodeUtf8(inputBytes);
+
+            // Assert (U+4000 starts at byte offset 3, after the 1-byte and 2-byte sequences)
+
+            Assert.Equal(3, idxOfFirstByteToEncode);
+        }
+
+        [Theory]
+        [InlineData(new byte[] { 0x00, 0xC0, 0x80, 0x80 }, 1)] // 0xC0 can never begin a well-formed UTF-8 sequence
+        [InlineData(new byte[] { 0x00, 0xC2, 0x80, 0x80 }, 3)] // [ C2 80 ] is well-formed; the extra 0x80 at index 3 is a stray continuation byte
+        [InlineData(new byte[] { 0xF1, 0x80, 0x80 }, 0)] // 4-byte sequence is missing its final byte
+        [InlineData(new byte[] { 0xF1, 0x80, 0x80, 0x80, 0xFF }, 4)] // complete 4-byte sequence followed by 0xFF, which is never valid in UTF-8
+        [InlineData(new byte[] { 0xFF, 0x80, 0x80, 0x80, 0xFF }, 0)] // ill-formed from the very first byte
+        public void FindFirstCharToEncodeUtf8_IllFormedData_ReturnsIndexOfIllFormedSubsequence(byte[] utf8Data, int expectedIndex)
+        {
+            // Arrange
+
+            var encoder = new ConfigurableScalarTextEncoder(_ => true /* allow everything */);
+
+            // Act
+
+            int actualIndex = encoder.FindFirstCharacterToEncodeUtf8(utf8Data);
+
+            // Assert (every scalar is allowed, so only ill-formed bytes can produce a non-negative index)
+
+            Assert.Equal(expectedIndex, actualIndex);
+        }
     }
 }
-- 
2.7.4