[master] Update dependencies from dotnet/coreclr (dotnet/corefx#36816)
author dotnet-maestro[bot] <dotnet-maestro[bot]@users.noreply.github.com>
Sat, 13 Apr 2019 18:53:57 +0000 (11:53 -0700)
committer Levi Broderick <GrabYourPitchforks@users.noreply.github.com>
Sat, 13 Apr 2019 18:53:57 +0000 (11:53 -0700)
- Microsoft.NET.Sdk.IL - 3.0.0-preview5-27612-73
- Microsoft.NETCore.ILAsm - 3.0.0-preview5-27612-73
- Microsoft.NETCore.Runtime.CoreCLR - 3.0.0-preview5-27612-73
- Also react to changes in the UTF-8 transcoding logic (dotnet/corefx#36712)

Commit migrated from https://github.com/dotnet/corefx/commit/4c3d4a083488b9f89676926a8b75aa5f7572f67f

src/libraries/System.Reflection.Metadata/tests/Metadata/Ecma335/MetadataBuilderTests.cs
src/libraries/System.Reflection.Metadata/tests/Metadata/Ecma335/MetadataRootBuilderTests.cs
src/libraries/System.Runtime/tests/System.Runtime.Tests.csproj
src/libraries/System.Runtime/tests/System/Text/Unicode/Utf16UtilityTests.ValidateChars.netcoreapp.cs [new file with mode: 0644]
src/libraries/System.Runtime/tests/System/Text/Unicode/Utf8Tests.ToBytes.netcoreapp.cs
src/libraries/System.Runtime/tests/System/Text/Unicode/Utf8Tests.ToChars.netcoreapp.cs
src/libraries/System.Runtime/tests/System/Text/Unicode/Utf8Tests.netcoreapp.cs
src/libraries/System.Runtime/tests/System/Text/Unicode/Utf8UtilityTests.ValidateBytes.netcoreapp.cs [new file with mode: 0644]
src/libraries/System.Text.Encoding/tests/NegativeEncodingTests.cs
src/libraries/System.Text.Encoding/tests/UTF8Encoding/UTF8EncodingDecode.cs
src/libraries/System.Text.Encoding/tests/UTF8Encoding/UTF8EncodingEncode.cs

index 6665314..4c20e9d 100644 (file)
@@ -492,7 +492,16 @@ namespace System.Reflection.Metadata.Ecma335.Tests
                 Assert.Equal(@"a/", mdReader.GetString(MetadataTokens.DocumentNameBlobHandle(MetadataTokens.GetHeapOffset(n6))));
                 Assert.Equal(@"/", mdReader.GetString(MetadataTokens.DocumentNameBlobHandle(MetadataTokens.GetHeapOffset(n7))));
                 Assert.Equal(@"\\", mdReader.GetString(MetadataTokens.DocumentNameBlobHandle(MetadataTokens.GetHeapOffset(n8))));
-                Assert.Equal("\uFFFd\uFFFd", mdReader.GetString(MetadataTokens.DocumentNameBlobHandle(MetadataTokens.GetHeapOffset(n9))));
+                if (PlatformDetection.IsNetCore)
+                {
+                    Assert.Equal("\uFFFD\uFFFD\uFFFD", mdReader.GetString(MetadataTokens.DocumentNameBlobHandle(MetadataTokens.GetHeapOffset(n9))));
+                }
+                else
+                {
+                    // Versions of .NET prior to Core 3.0 didn't follow Unicode recommendations for U+FFFD substitution,
+                    // so they sometimes emitted too few replacement chars.
+                    Assert.Equal("\uFFFD\uFFFD", mdReader.GetString(MetadataTokens.DocumentNameBlobHandle(MetadataTokens.GetHeapOffset(n9))));
+                }
                 Assert.Equal("\0", mdReader.GetString(MetadataTokens.DocumentNameBlobHandle(MetadataTokens.GetHeapOffset(n10))));
             }
         }
index b6c05bc..0922031 100644 (file)
@@ -377,11 +377,24 @@ namespace System.Reflection.Metadata.Ecma335.Tests
                 0x08, 0x00, 0x00, 0x00,
 
                 // padded version:
+                // [ E1 88 B4 ] -> U+1234
+                // [ ED ] -> invalid (ED cannot be followed by A0) -> U+FFFD
+                // [ A0 ] -> invalid (not ASCII, not valid leading byte) -> U+FFFD
+                // [ 80 ] -> invalid (not ASCII, not valid leading byte) -> U+FFFD
                 0xE1, 0x88, 0xB4, 0xED, 0xA0, 0x80, 0x00, 0x00,
             }, builder.Slice(12, -132));
 
             // the default decoder replaces bad byte sequences by U+FFFD
-            Assert.Equal("\u1234\ufffd\ufffd", ReadVersion(builder));
+            if (PlatformDetection.IsNetCore)
+            {
+                Assert.Equal("\u1234\ufffd\ufffd\ufffd", ReadVersion(builder));
+            }
+            else
+            {
+                // Versions of .NET prior to Core 3.0 didn't follow Unicode recommendations for U+FFFD substitution,
+                // so they sometimes emitted too few replacement chars.
+                Assert.Equal("\u1234\ufffd\ufffd", ReadVersion(builder));
+            }
         }
     }
 }
index 4b97b78..812406b 100644 (file)
     <Compile Include="System\Text\RuneTests.netcoreapp.cs" />
     <Compile Include="System\Text\RuneTests.TestData.netcoreapp.cs" />
     <Compile Include="System\Text\StringBuilderTests.netcoreapp.cs" />
+    <Compile Include="System\Text\Unicode\Utf16UtilityTests.ValidateChars.netcoreapp.cs" />
     <Compile Include="System\Text\Unicode\Utf8Tests.netcoreapp.cs" />
     <Compile Include="System\Text\Unicode\Utf8Tests.ToBytes.netcoreapp.cs" />
     <Compile Include="System\Text\Unicode\Utf8Tests.ToChars.netcoreapp.cs" />
+    <Compile Include="System\Text\Unicode\Utf8UtilityTests.ValidateBytes.netcoreapp.cs" />
     <Compile Include="System\Type\TypePropertyTests.netcoreapp.cs" />
     <Compile Include="System\Type\TypeTests.netcoreapp.cs" />
     <Compile Include="System\ArgIteratorTests.netcoreapp.cs" />
   <ItemGroup>
     <Service Include="{82A7F48D-3B50-4B1E-B82E-3ADA8210C358}" />
   </ItemGroup>
-</Project>
+</Project>
\ No newline at end of file
diff --git a/src/libraries/System.Runtime/tests/System/Text/Unicode/Utf16UtilityTests.ValidateChars.netcoreapp.cs b/src/libraries/System.Runtime/tests/System/Text/Unicode/Utf16UtilityTests.ValidateChars.netcoreapp.cs
new file mode 100644 (file)
index 0000000..fd87b57
--- /dev/null
@@ -0,0 +1,255 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Buffers;
+using System.Globalization;
+using System.Linq;
+using System.Reflection;
+using System.Runtime.InteropServices;
+using Xunit;
+
+namespace System.Text.Unicode.Tests
+{
+    public partial class Utf16UtilityTests
+    {
+        private unsafe delegate char* GetPointerToFirstInvalidCharDel(char* pInputBuffer, int inputLength, out long utf8CodeUnitCountAdjustment, out int scalarCountAdjustment);
+        private static readonly Lazy<GetPointerToFirstInvalidCharDel> _getPointerToFirstInvalidCharFn = CreateGetPointerToFirstInvalidCharFn();
+
+        [Theory]
+        [InlineData("", 0, 0)] // empty string is OK
+        [InlineData("X", 1, 1)]
+        [InlineData("XY", 2, 2)]
+        [InlineData("XYZ", 3, 3)]
+        [InlineData("<EACU>", 1, 2)]
+        [InlineData("X<EACU>", 2, 3)]
+        [InlineData("<EACU>X", 2, 3)]
+        [InlineData("<EURO>", 1, 3)]
+        [InlineData("<GRIN>", 1, 4)]
+        [InlineData("X<GRIN>Z", 3, 6)]
+        [InlineData("X<0000>Z", 3, 3)] // null chars are allowed
+        public void GetIndexOfFirstInvalidUtf16Sequence_WithSmallValidBuffers(string unprocessedInput, int expectedRuneCount, int expectedUtf8ByteCount)
+        {
+            GetIndexOfFirstInvalidUtf16Sequence_Test_Core(unprocessedInput, -1 /* expectedIdxOfFirstInvalidChar */, expectedRuneCount, expectedUtf8ByteCount);
+        }
+
+        [Theory]
+        [InlineData("<DC00>", 0, 0, 0)] // standalone low surrogate (at beginning of sequence)
+        [InlineData("X<DC00>", 1, 1, 1)] // standalone low surrogate (preceded by valid ASCII data)
+        [InlineData("<EURO><DC00>", 1, 1, 3)] // standalone low surrogate (preceded by valid non-ASCII data)
+        [InlineData("<D800>", 0, 0, 0)] // standalone high surrogate (missing follow-up low surrogate)
+        [InlineData("<D800>Y", 0, 0, 0)] // standalone high surrogate (followed by ASCII char)
+        [InlineData("<D800><D800>", 0, 0, 0)] // standalone high surrogate (followed by high surrogate)
+        [InlineData("<D800><EURO>", 0, 0, 0)] // standalone high surrogate (followed by valid non-ASCII char)
+        [InlineData("<DC00><DC00>", 0, 0, 0)] // standalone low surrogate (not preceded by a high surrogate)
+        [InlineData("<DC00><D800>", 0, 0, 0)] // standalone low surrogate (not preceded by a high surrogate)
+        [InlineData("<GRIN><DC00><DC00>", 2, 1, 4)] // standalone low surrogate (preceded by a valid surrogate pair)
+        [InlineData("<GRIN><DC00><D800>", 2, 1, 4)] // standalone low surrogate (preceded by a valid surrogate pair)
+        [InlineData("<GRIN><0000><DC00><D800>", 3, 2, 5)] // standalone low surrogate (preceded by a valid null char)
+        public void GetIndexOfFirstInvalidUtf16Sequence_WithSmallInvalidBuffers(string unprocessedInput, int idxOfFirstInvalidChar, int expectedRuneCount, int expectedUtf8ByteCount)
+        {
+            GetIndexOfFirstInvalidUtf16Sequence_Test_Core(unprocessedInput, idxOfFirstInvalidChar, expectedRuneCount, expectedUtf8ByteCount);
+        }
+
+        [Fact]
+        public void GetIndexOfFirstInvalidUtf16Sequence_WithInvalidSurrogateSequences()
+        {
+            // All ASCII
+
+            char[] chars = Enumerable.Repeat('x', 128).ToArray();
+            GetIndexOfFirstInvalidUtf16Sequence_Test_Core(chars, -1, expectedRuneCount: 128, expectedUtf8ByteCount: 128);
+
+            // Throw a surrogate pair at the beginning
+
+            chars[0] = '\uD800';
+            chars[1] = '\uDFFF';
+            GetIndexOfFirstInvalidUtf16Sequence_Test_Core(chars, -1, expectedRuneCount: 127, expectedUtf8ByteCount: 130);
+
+            // Throw a surrogate pair near the end
+
+            chars[124] = '\uD800';
+            chars[125] = '\uDFFF';
+            GetIndexOfFirstInvalidUtf16Sequence_Test_Core(chars, -1, expectedRuneCount: 126, expectedUtf8ByteCount: 132);
+
+            // Throw a standalone surrogate code point at the *very* end
+
+            chars[127] = '\uD800'; // high surrogate
+            GetIndexOfFirstInvalidUtf16Sequence_Test_Core(chars, 127, expectedRuneCount: 125, expectedUtf8ByteCount: 131);
+
+            chars[127] = '\uDFFF'; // low surrogate
+            GetIndexOfFirstInvalidUtf16Sequence_Test_Core(chars, 127, expectedRuneCount: 125, expectedUtf8ByteCount: 131);
+
+            // Make the final surrogate pair valid
+
+            chars[126] = '\uD800'; // high surrogate
+            GetIndexOfFirstInvalidUtf16Sequence_Test_Core(chars, -1, expectedRuneCount: 125, expectedUtf8ByteCount: 134);
+
+            // Throw an invalid surrogate sequence in the middle (straddles a vector boundary)
+
+            chars[12] = '\u0080'; // 2-byte UTF-8 sequence
+            chars[13] = '\uD800'; // high surrogate
+            chars[14] = '\uD800'; // high surrogate
+            chars[15] = '\uDFFF'; // low surrogate
+            chars[16] = '\uDFFF'; // low surrogate
+            GetIndexOfFirstInvalidUtf16Sequence_Test_Core(chars, 13, expectedRuneCount: 12, expectedUtf8ByteCount: 16);
+
+            // Correct the surrogate sequence we just added
+
+            chars[14] = '\uDC00'; // low surrogate
+            chars[15] = '\uDBFF'; // high surrogate
+            GetIndexOfFirstInvalidUtf16Sequence_Test_Core(chars, -1, expectedRuneCount: 123, expectedUtf8ByteCount: 139);
+
+            // Corrupt the surrogate pair that's split across a vector boundary
+
+            chars[16] = 'x'; // ASCII char (remember.. chars[15] is a high surrogate char)
+            GetIndexOfFirstInvalidUtf16Sequence_Test_Core(chars, 15, expectedRuneCount: 13, expectedUtf8ByteCount: 20);
+        }
+
+        private static void GetIndexOfFirstInvalidUtf16Sequence_Test_Core(string unprocessedInput, int expectedIdxOfFirstInvalidChar, int expectedRuneCount, long expectedUtf8ByteCount)
+        {
+            char[] processedInput = ProcessInput(unprocessedInput).ToCharArray();
+
+            // Run the test normally
+
+            GetIndexOfFirstInvalidUtf16Sequence_Test_Core(processedInput, expectedIdxOfFirstInvalidChar, expectedRuneCount, expectedUtf8ByteCount);
+
+            // Put a bunch of ASCII data at the beginning (to test the call to ASCIIUtility at method entry)
+
+            processedInput = Enumerable.Repeat('x', 128).Concat(processedInput).ToArray();
+
+            if (expectedIdxOfFirstInvalidChar >= 0)
+            {
+                expectedIdxOfFirstInvalidChar += 128;
+            }
+            expectedRuneCount += 128;
+            expectedUtf8ByteCount += 128;
+
+            GetIndexOfFirstInvalidUtf16Sequence_Test_Core(processedInput, expectedIdxOfFirstInvalidChar, expectedRuneCount, expectedUtf8ByteCount);
+
+            // Change the first few chars to a mixture of 2-byte and 3-byte UTF-8 sequences
+            // This makes sure the vectorized code paths can properly handle these.
+
+            processedInput[0] = '\u0080'; // 2-byte UTF-8 sequence
+            processedInput[1] = '\u0800'; // 3-byte UTF-8 sequence
+            processedInput[2] = '\u0080'; // 2-byte UTF-8 sequence
+            processedInput[3] = '\u0800'; // 3-byte UTF-8 sequence
+            processedInput[4] = '\u0080'; // 2-byte UTF-8 sequence
+            processedInput[5] = '\u0800'; // 3-byte UTF-8 sequence
+            processedInput[6] = '\u0080'; // 2-byte UTF-8 sequence
+            processedInput[7] = '\u0800'; // 3-byte UTF-8 sequence
+
+            expectedUtf8ByteCount += 12;
+
+            GetIndexOfFirstInvalidUtf16Sequence_Test_Core(processedInput, expectedIdxOfFirstInvalidChar, expectedRuneCount, expectedUtf8ByteCount);
+
+            // Throw some surrogate pairs into the mix to make sure they're also handled properly
+            // by the vectorized code paths.
+
+            processedInput[8] = '\u0080'; // 2-byte UTF-8 sequence
+            processedInput[9] = '\u0800'; // 3-byte UTF-8 sequence
+            processedInput[10] = '\u0080'; // 2-byte UTF-8 sequence
+            processedInput[11] = '\u0800'; // 3-byte UTF-8 sequence
+            processedInput[12] = '\u0080'; // 2-byte UTF-8 sequence
+            processedInput[13] = '\uD800'; // high surrogate
+            processedInput[14] = '\uDC00'; // low surrogate
+            processedInput[15] = 'z'; // ASCII char
+
+            expectedRuneCount--;
+            expectedUtf8ByteCount += 9;
+
+            GetIndexOfFirstInvalidUtf16Sequence_Test_Core(processedInput, expectedIdxOfFirstInvalidChar, expectedRuneCount, expectedUtf8ByteCount);
+
+            // Split the next surrogate pair across the vector boundary (so that we
+            // don't inadvertently treat this as a standalone surrogate sequence).
+
+            processedInput[15] = '\uDBFF'; // high surrogate
+            processedInput[16] = '\uDFFF'; // low surrogate
+
+            expectedRuneCount--;
+            expectedUtf8ByteCount += 2;
+
+            GetIndexOfFirstInvalidUtf16Sequence_Test_Core(processedInput, expectedIdxOfFirstInvalidChar, expectedRuneCount, expectedUtf8ByteCount);
+        }
+
+        private static unsafe void GetIndexOfFirstInvalidUtf16Sequence_Test_Core(char[] input, int expectedRetVal, int expectedRuneCount, long expectedUtf8ByteCount)
+        {
+            // Arrange: copy the input into a BoundedMemory buffer and mark it read-only
+
+            using BoundedMemory<char> boundedMemory = BoundedMemory.AllocateFromExistingData(input);
+            boundedMemory.MakeReadonly();
+
+            // Act: invoke the reflected validation routine over the raw buffer
+
+            int actualRetVal;
+            long actualUtf8CodeUnitCount;
+            int actualRuneCount;
+
+            fixed (char* pInputBuffer = &MemoryMarshal.GetReference(boundedMemory.Span))
+            {
+                char* pFirstInvalidChar = _getPointerToFirstInvalidCharFn.Value(pInputBuffer, input.Length, out long utf8CodeUnitCountAdjustment, out int scalarCountAdjustment);
+
+                long ptrDiff = pFirstInvalidChar - pInputBuffer;
+                Assert.True((ulong)ptrDiff <= (uint)input.Length, "ptrDiff was outside expected range.");
+
+                Assert.True(utf8CodeUnitCountAdjustment >= 0, "UTF-8 code unit count adjustment must be non-negative.");
+                Assert.True(scalarCountAdjustment <= 0, "Scalar count adjustment must be 0 or negative.");
+
+                actualRetVal = (ptrDiff == input.Length) ? -1 : (int)ptrDiff; // a pointer at the end of the buffer is reported as -1 (no invalid char)
+
+                // The last two 'out' parameters are:
+                // a) The number to be added to the "chars processed" return value to come up with the total UTF-8 code unit count, and
+                // b) The number to be added to the "total UTF-16 code unit count" value to come up with the total scalar count.
+
+                actualUtf8CodeUnitCount = ptrDiff + utf8CodeUnitCountAdjustment;
+                actualRuneCount = (int)ptrDiff + scalarCountAdjustment;
+            }
+
+            // Assert: xUnit's Assert.Equal takes (expected, actual), in that order
+
+            Assert.Equal(expectedRetVal, actualRetVal);
+            Assert.Equal(expectedRuneCount, actualRuneCount);
+            Assert.Equal(expectedUtf8ByteCount, actualUtf8CodeUnitCount);
+        }
+
+        private static Lazy<GetPointerToFirstInvalidCharDel> CreateGetPointerToFirstInvalidCharFn() // builds a lazily-bound delegate to Utf16Utility.GetPointerToFirstInvalidChar via reflection
+        {
+            return new Lazy<GetPointerToFirstInvalidCharDel>(() =>
+            {
+                Type utf16UtilityType = typeof(Utf8).Assembly.GetType("System.Text.Unicode.Utf16Utility"); // looked up by name in the assembly that defines Utf8
+
+                if (utf16UtilityType is null)
+                {
+                    throw new Exception("Couldn't find Utf16Utility type in System.Private.CoreLib.");
+                }
+
+                MethodInfo methodInfo = utf16UtilityType.GetMethod("GetPointerToFirstInvalidChar", BindingFlags.Static | BindingFlags.Public | BindingFlags.NonPublic); // static method, any visibility
+
+                if (methodInfo is null)
+                {
+                    throw new Exception("Couldn't find GetPointerToFirstInvalidChar method on Utf16Utility.");
+                }
+
+                return (GetPointerToFirstInvalidCharDel)methodInfo.CreateDelegate(typeof(GetPointerToFirstInvalidCharDel));
+            });
+        }
+
+        private static string ProcessInput(string input) // expands the <NAME> and <ABCD> placeholders used in the inline test data
+        {
+            input = input.Replace("<EACU>", "\u00E9", StringComparison.Ordinal); // U+00E9 LATIN SMALL LETTER E WITH ACUTE
+            input = input.Replace("<EURO>", "\u20AC", StringComparison.Ordinal); // U+20AC EURO SIGN
+            input = input.Replace("<GRIN>", "\U0001F600", StringComparison.Ordinal); // U+1F600 GRINNING FACE
+
+            // Replace <ABCD> with \uABCD. This lets us pass potentially malformed UTF-16
+            // strings through Xunit as inline data. (The unit testing framework gets angry
+            // when we try putting invalid UTF-16 data as inline test data.)
+
+            int idx;
+            while ((idx = input.IndexOf('<')) >= 0)
+            {
+                input = input[..idx] + (char)ushort.Parse(input.Substring(idx + 1, 4), NumberStyles.AllowHexSpecifier, CultureInfo.InvariantCulture) + input[idx + 6..]; // placeholder is 6 chars: '<', four hex digits, and a closing char that is skipped without being checked
+            }
+
+            return input;
+        }
+    }
+}
index 18ceedc..5432da0 100644 (file)
@@ -119,6 +119,34 @@ namespace System.Text.Unicode.Tests
                    expectedNumCharsRead: expectedNumCharsConsumed,
                    expectedUtf8Transcoding: concatenatedUtf8);
             }
+
+            // now throw lots of ASCII data at the beginning so that we exercise the vectorized code paths
+
+            utf16Input = new string('x', 64) + utf16Input;
+            concatenatedUtf8 = utf16Input.EnumerateRunes().SelectMany(ToUtf8).ToArray();
+
+            ToBytes_Test_Core(
+                utf16Input: utf16Input,
+                destinationSize: concatenatedUtf8.Length,
+                replaceInvalidSequences: false,
+                isFinalChunk: true,
+                expectedOperationStatus: OperationStatus.Done,
+                expectedNumCharsRead: utf16Input.Length,
+                expectedUtf8Transcoding: concatenatedUtf8);
+
+            // now throw some non-ASCII data at the beginning so that we *don't* exercise the vectorized code paths
+
+            utf16Input = WOMAN_CARTWHEELING_MEDSKIN_UTF16 + utf16Input[64..];
+            concatenatedUtf8 = utf16Input.EnumerateRunes().SelectMany(ToUtf8).ToArray();
+
+            ToBytes_Test_Core(
+                utf16Input: utf16Input,
+                destinationSize: concatenatedUtf8.Length,
+                replaceInvalidSequences: false,
+                isFinalChunk: true,
+                expectedOperationStatus: OperationStatus.Done,
+                expectedNumCharsRead: utf16Input.Length,
+                expectedUtf8Transcoding: concatenatedUtf8);
         }
 
         [Theory]
@@ -162,6 +190,18 @@ namespace System.Text.Unicode.Tests
                 expectedOperationStatus: OperationStatus.InvalidData,
                 expectedNumCharsRead: expectedNumCharsConsumed,
                 expectedUtf8Transcoding: DecodeHex(expectedUtf8TranscodingHex));
+
+            // Now try the tests again with a larger buffer.
+            // This ensures that running out of destination space wasn't the reason we failed.
+
+            ToBytes_Test_Core(
+                utf16Input: utf16Input,
+                destinationSize: (expectedUtf8TranscodingHex.Length) / 2 + 16,
+                replaceInvalidSequences: false,
+                isFinalChunk: false,
+                expectedOperationStatus: OperationStatus.InvalidData,
+                expectedNumCharsRead: expectedNumCharsConsumed,
+                expectedUtf8Transcoding: DecodeHex(expectedUtf8TranscodingHex));
         }
 
         [Theory]
index 6dda95d..cb39338 100644 (file)
@@ -42,6 +42,18 @@ namespace System.Text.Unicode.Tests
               expectedOperationStatus: OperationStatus.InvalidData,
               expectedNumBytesRead: expectedNumBytesConsumed,
               expectedUtf16Transcoding: expectedUtf16Transcoding);
+
+            // Now try the tests again with a larger buffer.
+            // This ensures that running out of destination space wasn't the reason we failed.
+
+            ToChars_Test_Core(
+              utf8Input: DecodeHex(utf8HexInput),
+              destinationSize: expectedUtf16Transcoding.Length + 16,
+              replaceInvalidSequences: false,
+              isFinalChunk: false,
+              expectedOperationStatus: OperationStatus.InvalidData,
+              expectedNumBytesRead: expectedNumBytesConsumed,
+              expectedUtf16Transcoding: expectedUtf16Transcoding);
         }
 
         [Theory]
@@ -74,6 +86,18 @@ namespace System.Text.Unicode.Tests
               expectedOperationStatus: OperationStatus.NeedMoreData,
               expectedNumBytesRead: expectedNumBytesConsumed,
               expectedUtf16Transcoding: expectedUtf16Transcoding);
+
+            // Now try the tests again with a larger buffer.
+            // This ensures that running out of destination space wasn't the reason we failed.
+
+            ToChars_Test_Core(
+             utf8Input: DecodeHex(utf8HexInput),
+             destinationSize: expectedUtf16Transcoding.Length + 16,
+             replaceInvalidSequences: false,
+             isFinalChunk: false,
+             expectedOperationStatus: OperationStatus.NeedMoreData,
+             expectedNumBytesRead: expectedNumBytesConsumed,
+             expectedUtf16Transcoding: expectedUtf16Transcoding);
         }
 
         [Theory]
@@ -104,7 +128,7 @@ namespace System.Text.Unicode.Tests
         [InlineData(EURO_SYMBOL_UTF16 + EURO_SYMBOL_UTF16 + E_ACUTE_UTF16 + E_ACUTE_UTF16 + E_ACUTE_UTF16 + E_ACUTE_UTF16)] // 2x 3-byte sequences + 4x 2-byte sequences, exercises "consume multiple bytes at a time" logic in 3-byte sequence processing
         [InlineData(GRINNING_FACE_UTF16 + GRINNING_FACE_UTF16)] // 2x 4-byte sequences, exercises 4-byte sequence processing
         [InlineData(GRINNING_FACE_UTF16 + "@AB")] // single 4-byte sequence + 3 ASCII bytes, exercises 4-byte sequence processing and draining logic
-        [InlineData("\U0001F938\U0001F3FD\u200D\u2640\uFE0F")] // U+1F938 U+1F3FD U+200D U+2640 U+FE0F WOMAN CARTWHEELING: MEDIUM SKIN TONE, exercising switching between multiple sequence lengths
+        [InlineData(WOMAN_CARTWHEELING_MEDSKIN_UTF16)] // exercises switching between multiple sequence lengths
         public void ToChars_ValidBuffers(string utf16Input)
         {
             // We're going to run the tests with destination buffer lengths ranging from 0 all the way
@@ -162,6 +186,34 @@ namespace System.Text.Unicode.Tests
                     expectedNumBytesRead: expectedNumBytesConsumed,
                     expectedUtf16Transcoding: concatenatedUtf16);
             }
+
+            // now throw lots of ASCII data at the beginning so that we exercise the vectorized code paths
+
+            utf16Input = new string('x', 64) + utf16Input;
+            utf8Input = utf16Input.EnumerateRunes().SelectMany(ToUtf8).ToArray();
+
+            ToChars_Test_Core(
+                utf8Input: utf8Input,
+                destinationSize: utf16Input.Length,
+                replaceInvalidSequences: false,
+                isFinalChunk: true,
+                expectedOperationStatus: OperationStatus.Done,
+                expectedNumBytesRead: utf8Input.Length,
+                expectedUtf16Transcoding: utf16Input);
+
+            // now throw some non-ASCII data at the beginning so that we *don't* exercise the vectorized code paths
+
+            utf16Input = WOMAN_CARTWHEELING_MEDSKIN_UTF16 + utf16Input[64..];
+            utf8Input = utf16Input.EnumerateRunes().SelectMany(ToUtf8).ToArray();
+
+            ToChars_Test_Core(
+                utf8Input: utf8Input,
+                destinationSize: utf16Input.Length,
+                replaceInvalidSequences: false,
+                isFinalChunk: true,
+                expectedOperationStatus: OperationStatus.Done,
+                expectedNumBytesRead: utf8Input.Length,
+                expectedUtf16Transcoding: utf16Input);
         }
 
         [Theory]
@@ -182,6 +234,7 @@ namespace System.Text.Unicode.Tests
         [InlineData("3031" + "E17F80" + EURO_SYMBOL_UTF8 + EURO_SYMBOL_UTF8, 2, "01")] // Improperly terminated 3-byte sequence at start of DWORD
         [InlineData("3031" + "E1C080" + EURO_SYMBOL_UTF8 + EURO_SYMBOL_UTF8, 2, "01")] // Improperly terminated 3-byte sequence at start of DWORD
         [InlineData("3031" + "EDA080" + EURO_SYMBOL_UTF8 + EURO_SYMBOL_UTF8, 2, "01")] // Surrogate 3-byte sequence at start of DWORD
+        [InlineData("3031" + "E69C88" + "E59B" + "E69C88", 5, "01\u6708")] // Incomplete 3-byte sequence surrounded by valid 3-byte sequences
         [InlineData("3031" + "F5808080", 2, "01")] // [ F5 ] is always invalid
         [InlineData("3031" + "F6808080", 2, "01")] // [ F6 ] is always invalid
         [InlineData("3031" + "F7808080", 2, "01")] // [ F7 ] is always invalid
@@ -208,6 +261,18 @@ namespace System.Text.Unicode.Tests
                 expectedOperationStatus: OperationStatus.InvalidData,
                 expectedNumBytesRead: expectedNumBytesConsumed,
                 expectedUtf16Transcoding: expectedUtf16Transcoding);
+
+            // Now try the tests again with a larger buffer.
+            // This ensures that running out of destination space wasn't the reason we failed.
+
+            ToChars_Test_Core(
+                utf8Input: DecodeHex(utf8HexInput),
+                destinationSize: expectedUtf16Transcoding.Length + 16,
+                replaceInvalidSequences: false,
+                isFinalChunk: false,
+                expectedOperationStatus: OperationStatus.InvalidData,
+                expectedNumBytesRead: expectedNumBytesConsumed,
+                expectedUtf16Transcoding: expectedUtf16Transcoding);
         }
 
         [Theory]
index 087235a..f57c769 100644 (file)
@@ -33,7 +33,9 @@ namespace System.Text.Unicode.Tests
 
         private const string GRINNING_FACE_UTF8 = "F09F9880"; // U+1F600 GRINNING FACE, 4 bytes
         private const string GRINNING_FACE_UTF16 = "\U0001F600";
-        
+
+        private const string WOMAN_CARTWHEELING_MEDSKIN_UTF16 = "\U0001F938\U0001F3FD\u200D\u2640\uFE0F"; // U+1F938 U+1F3FD U+200D U+2640 U+FE0F WOMAN CARTWHEELING: MEDIUM SKIN TONE
+
         // All valid scalars [ U+0000 .. U+D7FF ] and [ U+E000 .. U+10FFFF ].
         private static readonly IEnumerable<Rune> s_allValidScalars = Enumerable.Range(0x0000, 0xD800).Concat(Enumerable.Range(0xE000, 0x110000 - 0xE000)).Select(value => new Rune(value));
 
@@ -59,7 +61,7 @@ namespace System.Text.Unicode.Tests
          * COMMON UTILITIES FOR UNIT TESTS
          */
 
-        private static byte[] DecodeHex(ReadOnlySpan<char> inputHex)
+        public static byte[] DecodeHex(ReadOnlySpan<char> inputHex)
         {
             Assert.True(Regex.IsMatch(inputHex.ToString(), "^([0-9a-fA-F]{2})*$"), "Input must be an even number of hex characters.");
 
@@ -74,7 +76,7 @@ namespace System.Text.Unicode.Tests
         // !! IMPORTANT !!
         // Don't delete this implementation, as we use it as a reference to make sure the framework's
         // transcoding logic is correct.
-        private static byte[] ToUtf8(Rune rune)
+        public static byte[] ToUtf8(Rune rune)
         {
             Assert.True(Rune.IsValid(rune.Value), $"Rune with value U+{(uint)rune.Value:X4} is not well-formed.");
 
diff --git a/src/libraries/System.Runtime/tests/System/Text/Unicode/Utf8UtilityTests.ValidateBytes.netcoreapp.cs b/src/libraries/System.Runtime/tests/System/Text/Unicode/Utf8UtilityTests.ValidateBytes.netcoreapp.cs
new file mode 100644 (file)
index 0000000..899faa8
--- /dev/null
@@ -0,0 +1,417 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Buffers;
+using System.Linq;
+using System.Reflection;
+using System.Runtime.InteropServices;
+using Xunit;
+
+namespace System.Text.Unicode.Tests
+{
+    public partial class Utf8UtilityTests
+    {
+        // Delegate type that the reflected Utf8Utility.GetPointerToFirstInvalidByte method is bound to
+        // (see CreateGetPointerToFirstInvalidByteFn); the Lazy wrapper defers the reflection lookup to first use.
+        private unsafe delegate byte* GetPointerToFirstInvalidByteDel(byte* pInputBuffer, int inputLength, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment);
+        private static readonly Lazy<GetPointerToFirstInvalidByteDel> _getPointerToFirstInvalidByteFn = CreateGetPointerToFirstInvalidByteFn();
+
+        // Hex-encoded UTF-8 byte sequences used as building blocks for the test inputs below.
+        private const string X = "58"; // U+0058 LATIN CAPITAL LETTER X, 1 byte
+        private const string Y = "59"; // U+0059 LATIN CAPITAL LETTER Y, 1 byte
+        private const string Z = "5A"; // U+005A LATIN CAPITAL LETTER Z, 1 byte
+        private const string E_ACUTE = "C3A9"; // U+00E9 LATIN SMALL LETTER E WITH ACUTE, 2 bytes
+        private const string EURO_SYMBOL = "E282AC"; // U+20AC EURO SIGN, 3 bytes
+        private const string GRINNING_FACE = "F09F9880"; // U+1F600 GRINNING FACE, 4 bytes
+
+        [Theory]
+        [InlineData("", 0, 0)] // empty string is OK
+        [InlineData(X, 1, 0)]
+        [InlineData(X + Y, 2, 0)]
+        [InlineData(X + Y + Z, 3, 0)]
+        [InlineData(E_ACUTE, 1, 0)]
+        [InlineData(X + E_ACUTE, 2, 0)]
+        [InlineData(E_ACUTE + X, 2, 0)]
+        [InlineData(EURO_SYMBOL, 1, 0)]
+        public void GetIndexOfFirstInvalidUtf8Sequence_WithSmallValidBuffers(string input, int expectedRuneCount, int expectedSurrogatePairCount)
+        {
+            // These test cases are for the "slow processing" code path at the end of GetIndexOfFirstInvalidUtf8Sequence,
+            // so inputs should be less than 4 bytes.
+
+            // 'input' is a hex string (two characters per byte), so at most 6 characters means at most 3 bytes.
+            Assert.InRange(input.Length, 0, 6);
+
+            // An expected return value of -1 means that no invalid sequence should be found.
+            GetIndexOfFirstInvalidUtf8Sequence_Test_Core(input, -1 /* expectedRetVal */, expectedRuneCount, expectedSurrogatePairCount);
+        }
+
+        [Theory]
+        [InlineData("80", 0, 0, 0)] // sequence cannot begin with continuation character
+        [InlineData("8182", 0, 0, 0)] // sequence cannot begin with continuation character
+        [InlineData("838485", 0, 0, 0)] // sequence cannot begin with continuation character
+        [InlineData(X + "80", 1, 1, 0)] // sequence cannot begin with continuation character
+        [InlineData(X + "8182", 1, 1, 0)] // sequence cannot begin with continuation character
+        [InlineData("C0", 0, 0, 0)] // [ C0 ] is always invalid
+        [InlineData("C080", 0, 0, 0)] // [ C0 ] is always invalid
+        [InlineData("C08081", 0, 0, 0)] // [ C0 ] is always invalid
+        [InlineData(X + "C1", 1, 1, 0)] // [ C1 ] is always invalid
+        [InlineData(X + "C180", 1, 1, 0)] // [ C1 ] is always invalid
+        [InlineData("C2", 0, 0, 0)] // [ C2 ] is improperly terminated
+        [InlineData(X + "C27F", 1, 1, 0)] // [ C2 ] is improperly terminated
+        [InlineData(X + "E282", 1, 1, 0)] // [ E2 82 ] is improperly terminated
+        [InlineData("E2827F", 0, 0, 0)] // [ E2 82 ] is improperly terminated
+        [InlineData("E09F80", 0, 0, 0)] // [ E0 9F ... ] is overlong
+        [InlineData("E0C080", 0, 0, 0)] // [ E0 ] is improperly terminated
+        [InlineData("ED7F80", 0, 0, 0)] // [ ED ] is improperly terminated
+        [InlineData("EDA080", 0, 0, 0)] // [ ED A0 ... ] is surrogate
+        public void GetIndexOfFirstInvalidUtf8Sequence_WithSmallInvalidBuffers(string input, int expectedRetVal, int expectedRuneCount, int expectedSurrogatePairCount)
+        {
+            // These test cases are for the "slow processing" code path at the end of GetIndexOfFirstInvalidUtf8Sequence,
+            // so inputs should be less than 4 bytes.
+
+            // 'input' is a hex string (two characters per byte), so at most 6 characters means at most 3 bytes.
+            Assert.InRange(input.Length, 0, 6);
+
+            // expectedRetVal is the byte index at which the first invalid sequence is expected to start.
+            GetIndexOfFirstInvalidUtf8Sequence_Test_Core(input, expectedRetVal, expectedRuneCount, expectedSurrogatePairCount);
+        }
+
+        [Theory]
+        [InlineData(E_ACUTE + "21222324" + "303132333435363738393A3B3C3D3E3F", 21, 0)] // Loop unrolling at end of buffer
+        [InlineData(E_ACUTE + "21222324" + "303132333435363738393A3B3C3D3E3F" + "3031323334353637" + E_ACUTE + "38393A3B3C3D3E3F", 38, 0)] // Loop unrolling interrupted by non-ASCII
+        [InlineData("212223" + E_ACUTE + "30313233", 8, 0)] // 3 ASCII bytes followed by non-ASCII
+        [InlineData("2122" + E_ACUTE + "30313233", 7, 0)] // 2 ASCII bytes followed by non-ASCII
+        [InlineData("21" + E_ACUTE + "30313233", 6, 0)] // 1 ASCII byte followed by non-ASCII
+        [InlineData(E_ACUTE + E_ACUTE + E_ACUTE + E_ACUTE, 4, 0)] // 4x 2-byte sequences, exercises optimization code path in 2-byte sequence processing
+        [InlineData(E_ACUTE + E_ACUTE + E_ACUTE + "5051", 5, 0)] // 3x 2-byte sequences + 2 ASCII bytes, exercises optimization code path in 2-byte sequence processing
+        [InlineData(E_ACUTE + "5051", 3, 0)] // single 2-byte sequence + 2 trailing ASCII bytes, exercises draining logic in 2-byte sequence processing
+        [InlineData(E_ACUTE + "50" + E_ACUTE + "304050", 6, 0)] // single 2-byte sequences + 1 trailing ASCII byte + 2-byte sequence, exercises draining logic in 2-byte sequence processing
+        [InlineData(EURO_SYMBOL + "20", 2, 0)] // single 3-byte sequence + 1 trailing ASCII byte, exercises draining logic in 3-byte sequence processing
+        [InlineData(EURO_SYMBOL + "203040", 4, 0)] // single 3-byte sequence + 3 trailing ASCII byte, exercises draining logic and "running out of data" logic in 3-byte sequence processing
+        [InlineData(EURO_SYMBOL + EURO_SYMBOL + EURO_SYMBOL, 3, 0)] // 3x 3-byte sequences, exercises "stay within 3-byte loop" logic in 3-byte sequence processing
+        [InlineData(EURO_SYMBOL + EURO_SYMBOL + EURO_SYMBOL + EURO_SYMBOL, 4, 0)] // 4x 3-byte sequences, exercises "consume multiple bytes at a time" logic in 3-byte sequence processing
+        [InlineData(EURO_SYMBOL + EURO_SYMBOL + EURO_SYMBOL + E_ACUTE, 4, 0)] // 3x 3-byte sequences + single 2-byte sequence, exercises "consume multiple bytes at a time" logic in 3-byte sequence processing
+        [InlineData(EURO_SYMBOL + EURO_SYMBOL + E_ACUTE + E_ACUTE + E_ACUTE + E_ACUTE, 6, 0)] // 2x 3-byte sequences + 4x 2-byte sequences, exercises "consume multiple bytes at a time" logic in 3-byte sequence processing
+        [InlineData(GRINNING_FACE + GRINNING_FACE, 2, 2)] // 2x 4-byte sequences, exercises 4-byte sequence processing
+        [InlineData(GRINNING_FACE + "303132", 4, 1)] // single 4-byte sequence + 3 ASCII bytes, exercises 4-byte sequence processing and draining logic
+        [InlineData("F09FA4B8" + "F09F8FBD" + "E2808D" + "E29980" + "EFB88F", 5, 2)] // U+1F938 U+1F3FD U+200D U+2640 U+FE0F WOMAN CARTWHEELING: MEDIUM SKIN TONE, exercising switching between multiple sequence lengths
+        public void GetIndexOfFirstInvalidUtf8Sequence_WithLargeValidBuffers(string input, int expectedRuneCount, int expectedSurrogatePairCount)
+        {
+            // These test cases are for the "fast processing" code which is the main loop of GetIndexOfFirstInvalidUtf8Sequence,
+            // so inputs should be >= 4 bytes.
+
+            // 'input' is a hex string (two characters per byte), so at least 8 characters means at least 4 bytes.
+            Assert.True(input.Length >= 8);
+
+            // An expected return value of -1 means that no invalid sequence should be found.
+            GetIndexOfFirstInvalidUtf8Sequence_Test_Core(input, -1 /* expectedRetVal */, expectedRuneCount, expectedSurrogatePairCount);
+        }
+
+        [Theory]
+        [InlineData("3031" + "80" + "202122232425", 2, 2, 0)] // Continuation character at start of sequence should match no bitmask
+        [InlineData("3031" + "C080" + "2021222324", 2, 2, 0)] // Overlong 2-byte sequence at start of DWORD
+        [InlineData("3031" + "C180" + "2021222324", 2, 2, 0)] // Overlong 2-byte sequence at start of DWORD
+        [InlineData("C280" + "C180", 2, 1, 0)] // Overlong 2-byte sequence at end of DWORD
+        [InlineData("C27F" + "C280", 0, 0, 0)] // Improperly terminated 2-byte sequence at start of DWORD
+        [InlineData("C2C0" + "C280", 0, 0, 0)] // Improperly terminated 2-byte sequence at start of DWORD
+        [InlineData("C280" + "C27F", 2, 1, 0)] // Improperly terminated 2-byte sequence at end of DWORD
+        [InlineData("C280" + "C2C0", 2, 1, 0)] // Improperly terminated 2-byte sequence at end of DWORD
+        [InlineData("C280" + "C280" + "80203040", 4, 2, 0)] // Continuation character at start of sequence, within "stay in 2-byte processing" optimization
+        [InlineData("C280" + "C280" + "C180" + "C280", 4, 2, 0)] // Overlong 2-byte sequence at start of DWORD, within "stay in 2-byte processing" optimization
+        [InlineData("C280" + "C280" + "C280" + "C180", 6, 3, 0)] // Overlong 2-byte sequence at end of DWORD, within "stay in 2-byte processing" optimization
+        [InlineData("3031" + "E09F80" + EURO_SYMBOL + EURO_SYMBOL, 2, 2, 0)] // Overlong 3-byte sequence at start of DWORD
+        [InlineData("3031" + "E07F80" + EURO_SYMBOL + EURO_SYMBOL, 2, 2, 0)] // Improperly terminated 3-byte sequence at start of DWORD
+        [InlineData("3031" + "E0C080" + EURO_SYMBOL + EURO_SYMBOL, 2, 2, 0)] // Improperly terminated 3-byte sequence at start of DWORD
+        [InlineData("3031" + "E17F80" + EURO_SYMBOL + EURO_SYMBOL, 2, 2, 0)] // Improperly terminated 3-byte sequence at start of DWORD
+        [InlineData("3031" + "E1C080" + EURO_SYMBOL + EURO_SYMBOL, 2, 2, 0)] // Improperly terminated 3-byte sequence at start of DWORD
+        [InlineData("3031" + "EDA080" + EURO_SYMBOL + EURO_SYMBOL, 2, 2, 0)] // Surrogate 3-byte sequence at start of DWORD
+        [InlineData("3031" + "E69C88" + "E59B" + "E69C88", 5, 3, 0)] // Incomplete 3-byte sequence surrounded by valid 3-byte sequences
+        [InlineData("3031" + "F5808080", 2, 2, 0)] // [ F5 ] is always invalid
+        [InlineData("3031" + "F6808080", 2, 2, 0)] // [ F6 ] is always invalid
+        [InlineData("3031" + "F7808080", 2, 2, 0)] // [ F7 ] is always invalid
+        [InlineData("3031" + "F8808080", 2, 2, 0)] // [ F8 ] is always invalid
+        [InlineData("3031" + "F9808080", 2, 2, 0)] // [ F9 ] is always invalid
+        [InlineData("3031" + "FA808080", 2, 2, 0)] // [ FA ] is always invalid
+        [InlineData("3031" + "FB808080", 2, 2, 0)] // [ FB ] is always invalid
+        [InlineData("3031" + "FC808080", 2, 2, 0)] // [ FC ] is always invalid
+        [InlineData("3031" + "FD808080", 2, 2, 0)] // [ FD ] is always invalid
+        [InlineData("3031" + "FE808080", 2, 2, 0)] // [ FE ] is always invalid
+        [InlineData("3031" + "FF808080", 2, 2, 0)] // [ FF ] is always invalid
+        public void GetIndexOfFirstInvalidUtf8Sequence_WithLargeInvalidBuffers(string input, int expectedRetVal, int expectedRuneCount, int expectedSurrogatePairCount)
+        {
+            // These test cases are for the "fast processing" code which is the main loop of GetIndexOfFirstInvalidUtf8Sequence,
+            // so inputs should be >= 4 bytes.
+
+            // 'input' is a hex string (two characters per byte), so at least 8 characters means at least 4 bytes.
+            Assert.True(input.Length >= 8);
+
+            // expectedRetVal is the byte index at which the first invalid sequence is expected to start.
+            GetIndexOfFirstInvalidUtf8Sequence_Test_Core(input, expectedRetVal, expectedRuneCount, expectedSurrogatePairCount);
+        }
+
+        [Fact]
+        public void GetIndexOfFirstInvalidUtf8Sequence_WithOverlongTwoByteSequences_ReturnsInvalid()
+        {
+            // Each candidate is handed to AssertIsInvalidTwoByteSequence, which embeds it at several byte
+            // offsets and expects the reported index to be the offset of the candidate's first byte.
+
+            // [ C0 ] is never a valid byte, indicates overlong 2-byte sequence
+            // We'll test that [ C0 ] [ 00..FF ] is treated as invalid
+
+            for (int i = 0; i < 256; i++)
+            {
+                AssertIsInvalidTwoByteSequence(new byte[] { 0xC0, (byte)i });
+            }
+
+            // [ C1 ] is never a valid byte, indicates overlong 2-byte sequence
+            // We'll test that [ C1 ] [ 00..FF ] is treated as invalid
+
+            for (int i = 0; i < 256; i++)
+            {
+                AssertIsInvalidTwoByteSequence(new byte[] { 0xC1, (byte)i });
+            }
+        }
+
+        [Fact]
+        public void GetIndexOfFirstInvalidUtf8Sequence_WithImproperlyTerminatedTwoByteSequences_ReturnsInvalid()
+        {
+            // Test [ C2..DF ] [ 00..7F ] and [ C2..DF ] [ C0..FF ]: a valid 2-byte lead byte followed by
+            // anything other than a continuation byte [ 80..BF ] must be reported as invalid.
+
+            for (int i = 0xC2; i <= 0xDF; i++) // inclusive upper bound so that lead byte [ DF ] is covered too
+            {
+                for (int j = 0; j < 0x80; j++)
+                {
+                    AssertIsInvalidTwoByteSequence(new byte[] { (byte)i, (byte)j });
+                }
+                for (int j = 0xC0; j < 0x100; j++)
+                {
+                    AssertIsInvalidTwoByteSequence(new byte[] { (byte)i, (byte)j });
+                }
+            }
+        }
+
+        [Fact]
+        public void GetIndexOfFirstInvalidUtf8Sequence_WithOverlongThreeByteSequences_ReturnsInvalid()
+        {
+            // [ E0 ] [ 80..9F ] [ 80..BF ] is overlong 3-byte sequence
+            // Note that the loop starts at 00, so second bytes [ 00..7F ] (not continuation bytes at all)
+            // are swept as well and are likewise expected to be reported as invalid.
+
+            for (int i = 0x00; i < 0xA0; i++)
+            {
+                AssertIsInvalidThreeByteSequence(new byte[] { 0xE0, (byte)i, 0x80 });
+            }
+        }
+
+        [Fact]
+        public void GetIndexOfFirstInvalidUtf8Sequence_WithSurrogateThreeByteSequences_ReturnsInvalid()
+        {
+            // [ ED ] [ A0..BF ] [ 80..BF ] is surrogate 3-byte sequence
+            // Note that the loop runs through FF, so second bytes [ C0..FF ] (not continuation bytes at all)
+            // are swept as well and are likewise expected to be reported as invalid.
+
+            for (int i = 0xA0; i < 0x100; i++)
+            {
+                AssertIsInvalidThreeByteSequence(new byte[] { 0xED, (byte)i, 0x80 });
+            }
+        }
+
+        [Fact]
+        public void GetIndexOfFirstInvalidUtf8Sequence_WithImproperlyTerminatedThreeByteSequence_ReturnsInvalid()
+        {
+            // [ E0..EF ] [ 80..BF ] [ !(80..BF) ] is improperly terminated 3-byte sequence
+            // The third byte 'j' sweeps [ 00..7F ] in the first inner loop and [ C0..FF ] in the second.
+
+            for (int i = 0xE0; i < 0xF0; i++)
+            {
+                for (int j = 0x00; j < 0x80; j++)
+                {
+                    // Use both '9F' and 'A0' to make sure at least one isn't caught by overlong / surrogate checks
+                    // ([ E0 9F ] is overlong and [ ED A0 ] is a surrogate, per the dedicated tests for those cases).
+                    AssertIsInvalidThreeByteSequence(new byte[] { (byte)i, 0x9F, (byte)j });
+                    AssertIsInvalidThreeByteSequence(new byte[] { (byte)i, 0xA0, (byte)j });
+                }
+                for (int j = 0xC0; j < 0x100; j++)
+                {
+                    // Use both '9F' and 'A0' to make sure at least one isn't caught by overlong / surrogate checks
+                    AssertIsInvalidThreeByteSequence(new byte[] { (byte)i, 0x9F, (byte)j });
+                    AssertIsInvalidThreeByteSequence(new byte[] { (byte)i, 0xA0, (byte)j });
+                }
+            }
+        }
+
+        [Fact]
+        public void GetIndexOfFirstInvalidUtf8Sequence_WithOverlongFourByteSequences_ReturnsInvalid()
+        {
+            // [ F0 ] [ 80..8F ] [ 80..BF ] [ 80..BF ] is overlong 4-byte sequence
+            // Note that the loop starts at 00, so second bytes [ 00..7F ] (not continuation bytes at all)
+            // are swept as well and are likewise expected to be reported as invalid.
+
+            for (int i = 0x00; i < 0x90; i++)
+            {
+                AssertIsInvalidFourByteSequence(new byte[] { 0xF0, (byte)i, 0x80, 0x80 });
+            }
+        }
+
+        [Fact]
+        public void GetIndexOfFirstInvalidUtf8Sequence_WithOutOfRangeFourByteSequences_ReturnsInvalid()
+        {
+            // [ F4 ] [ 90..BF ] [ 80..BF ] [ 80..BF ] is out-of-range 4-byte sequence
+            // Note that the loop runs through FF, so second bytes [ C0..FF ] (not continuation bytes at all)
+            // are swept as well and are likewise expected to be reported as invalid.
+
+            for (int i = 0x90; i < 0x100; i++)
+            {
+                AssertIsInvalidFourByteSequence(new byte[] { 0xF4, (byte)i, 0x80, 0x80 });
+            }
+        }
+
+        [Fact]
+        public void GetIndexOfFirstInvalidUtf8Sequence_WithInvalidFourByteSequence_ReturnsInvalid()
+        {
+            // [ F0..F4 ] followed by a non-continuation byte (anything outside [ 80..BF ]) in any one of the
+            // three trailing positions is an improperly terminated 4-byte sequence.
+            // The bad byte 'j' sweeps [ 00..7F ] in the first inner loop and [ C0..FF ] in the second.
+
+            for (int i = 0xF0; i < 0xF5; i++)
+            {
+                for (int j = 0x00; j < 0x80; j++)
+                {
+                    AssertIsInvalidFourByteSequence(new byte[] { (byte)i, (byte)j, 0x80, 0x80 });
+
+                    // Use both '8F' and '90' to make sure at least one isn't caught by overlong / out-of-range checks
+                    // ([ F0 8F ] is overlong but [ F0 90 ] is not; [ F4 90 ] is out of range but [ F4 8F ] is not).
+                    AssertIsInvalidFourByteSequence(new byte[] { (byte)i, 0x8F, (byte)j, 0x80 });
+                    AssertIsInvalidFourByteSequence(new byte[] { (byte)i, 0x90, (byte)j, 0x80 });
+
+                    AssertIsInvalidFourByteSequence(new byte[] { (byte)i, 0x8F, 0x80, (byte)j });
+                    AssertIsInvalidFourByteSequence(new byte[] { (byte)i, 0x90, 0x80, (byte)j });
+                }
+                for (int j = 0xC0; j < 0x100; j++)
+                {
+                    AssertIsInvalidFourByteSequence(new byte[] { (byte)i, (byte)j, 0x80, 0x80 });
+
+                    // Use both '8F' and '90' to make sure at least one isn't caught by overlong / out-of-range checks
+                    AssertIsInvalidFourByteSequence(new byte[] { (byte)i, 0x8F, (byte)j, 0x80 });
+                    AssertIsInvalidFourByteSequence(new byte[] { (byte)i, 0x90, (byte)j, 0x80 });
+
+                    AssertIsInvalidFourByteSequence(new byte[] { (byte)i, 0x8F, 0x80, (byte)j });
+                    AssertIsInvalidFourByteSequence(new byte[] { (byte)i, 0x90, 0x80, (byte)j });
+                }
+            }
+        }
+
+        // Asserts that the given 2-byte sequence is reported as invalid when it is placed at byte offsets
+        // 0, 2, 4 and 6 of a buffer. Every preceding 2-byte slot holds a valid E_ACUTE sequence, so the
+        // expected rune count equals the number of preceding slots and no surrogate pairs are expected.
+        private static void AssertIsInvalidTwoByteSequence(byte[] invalidSequence)
+        {
+            Assert.Equal(2, invalidSequence.Length);
+
+            byte[] knownGoodBytes = Utf8Tests.DecodeHex(E_ACUTE);
+
+            byte[] toTest = invalidSequence.Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); // at start of first DWORD
+            GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 0, 0, 0);
+
+            toTest = knownGoodBytes.Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); // at end of first DWORD
+            GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 2, 1, 0);
+
+            // Run the same tests but with extra data at the beginning so that we're inside one of
+            // the 2-byte processing "hot loop" code paths.
+
+            toTest = knownGoodBytes.Concat(knownGoodBytes).Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); // at start of next DWORD
+            GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 4, 2, 0);
+
+            toTest = knownGoodBytes.Concat(knownGoodBytes).Concat(knownGoodBytes).Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); // at end of next DWORD
+            GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 6, 3, 0);
+        }
+
+        // Asserts that the given 3-byte sequence is reported as invalid when it is placed at byte offsets
+        // 0, 3, 6 and 9 of a buffer. Every preceding 3-byte slot holds a valid EURO_SYMBOL sequence, so the
+        // expected rune count equals the number of preceding slots and no surrogate pairs are expected.
+        private static void AssertIsInvalidThreeByteSequence(byte[] invalidSequence)
+        {
+            Assert.Equal(3, invalidSequence.Length);
+
+            byte[] knownGoodBytes = Utf8Tests.DecodeHex(EURO_SYMBOL);
+
+            byte[] toTest = invalidSequence.Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); // at start of first DWORD
+            GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 0, 0, 0);
+
+            // Run the same tests but with extra data at the beginning so that we're inside one of
+            // the 3-byte processing "hot loop" code paths.
+
+            toTest = knownGoodBytes.Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); // straddling first and second DWORDs
+            GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 3, 1, 0);
+
+            toTest = knownGoodBytes.Concat(knownGoodBytes).Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); // straddling second and third DWORDs
+            GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 6, 2, 0);
+
+            toTest = knownGoodBytes.Concat(knownGoodBytes).Concat(knownGoodBytes).Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); // at end of third DWORD
+            GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 9, 3, 0);
+        }
+
+        // Asserts that the given 4-byte sequence is reported as invalid both at the very start of a buffer
+        // and immediately after one valid GRINNING_FACE sequence.
+        private static void AssertIsInvalidFourByteSequence(byte[] invalidSequence)
+        {
+            Assert.Equal(4, invalidSequence.Length);
+
+            byte[] knownGoodBytes = Utf8Tests.DecodeHex(GRINNING_FACE);
+
+            // Invalid sequence first: nothing valid precedes it, so index 0 and zero counts are expected.
+            byte[] toTest = invalidSequence.Concat(invalidSequence).Concat(knownGoodBytes).ToArray();
+            GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 0, 0, 0);
+
+            // One valid 4-byte sequence first: index 4, one rune, and one surrogate pair are expected.
+            toTest = knownGoodBytes.Concat(invalidSequence).Concat(knownGoodBytes).ToArray();
+            GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 4, 1, 1);
+        }
+
+        // Hex-string overload: decodes 'inputHex' to bytes and runs the byte[] overload three times, first
+        // on the raw input, then with 128 ASCII bytes prepended, then with 7 further ASCII bytes prepended
+        // (135 in total). Expected index and rune count are shifted by the prefix length; a negative
+        // expectedRetVal ("no invalid sequence") is passed through unchanged.
+        private static void GetIndexOfFirstInvalidUtf8Sequence_Test_Core(string inputHex, int expectedRetVal, int expectedRuneCount, int expectedSurrogatePairCount)
+        {
+            byte[] inputBytes = Utf8Tests.DecodeHex(inputHex);
+
+            // Run the test normally
+            GetIndexOfFirstInvalidUtf8Sequence_Test_Core(inputBytes, expectedRetVal, expectedRuneCount, expectedSurrogatePairCount);
+
+            // Then run the test with a bunch of ASCII data at the beginning (to exercise the vectorized code paths)
+            inputBytes = Enumerable.Repeat((byte)'x', 128).Concat(inputBytes).ToArray();
+            GetIndexOfFirstInvalidUtf8Sequence_Test_Core(inputBytes, (expectedRetVal < 0) ? expectedRetVal : (expectedRetVal + 128), expectedRuneCount + 128, expectedSurrogatePairCount);
+
+            // Then put a few more ASCII bytes at the beginning (to test that offsets are properly handled)
+            // These 7 bytes go in front of the 128 already prepended above, hence the offset of 135.
+            inputBytes = Enumerable.Repeat((byte)'x', 7).Concat(inputBytes).ToArray();
+            GetIndexOfFirstInvalidUtf8Sequence_Test_Core(inputBytes, (expectedRetVal < 0) ? expectedRetVal : (expectedRetVal + 135), expectedRuneCount + 135, expectedSurrogatePairCount);
+        }
+
+        // Byte-array overload: invokes the reflected GetPointerToFirstInvalidByte delegate on a copy of
+        // 'input' and compares three derived values against the expectations: the index of the first
+        // invalid byte (-1 when the returned pointer is at the end of the buffer), the rune count, and the
+        // surrogate pair count. The latter two are computed from the returned pointer and the two 'out' adjustments.
+        private static unsafe void GetIndexOfFirstInvalidUtf8Sequence_Test_Core(byte[] input, int expectedRetVal, int expectedRuneCount, int expectedSurrogatePairCount)
+        {
+            // Arrange
+
+            // NOTE(review): BoundedMemory is presumably a test helper that surrounds the data with guard
+            // regions so that out-of-bounds accesses fault - confirm against its implementation.
+            using BoundedMemory<byte> boundedMemory = BoundedMemory.AllocateFromExistingData(input);
+            boundedMemory.MakeReadonly();
+
+            // Act
+
+            int actualRetVal;
+            int actualSurrogatePairCount;
+            int actualRuneCount;
+
+            fixed (byte* pInputBuffer = &MemoryMarshal.GetReference(boundedMemory.Span))
+            {
+                byte* pFirstInvalidByte = _getPointerToFirstInvalidByteFn.Value(pInputBuffer, input.Length, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment);
+
+                // The unsigned comparison also rejects a negative ptrDiff (it becomes a huge ulong value).
+                long ptrDiff = pFirstInvalidByte - pInputBuffer;
+                Assert.True((ulong)ptrDiff <= (uint)input.Length, "ptrDiff was outside expected range.");
+
+                Assert.True(utf16CodeUnitCountAdjustment <= 0, "UTF-16 code unit count adjustment must be 0 or negative.");
+                Assert.True(scalarCountAdjustment <= 0, "Scalar count adjustment must be 0 or negative.");
+
+                // A pointer to the end of the buffer means that no invalid byte was found; report that as -1.
+                actualRetVal = (ptrDiff == input.Length) ? -1 : (int)ptrDiff;
+
+                // The last two 'out' parameters are:
+                // a) The number to be added to the "bytes processed" return value to come up with the total UTF-16 code unit count, and
+                // b) The number to be added to the "total UTF-16 code unit count" value to come up with the total scalar count.
+
+                int totalUtf16CodeUnitCount = (int)ptrDiff + utf16CodeUnitCountAdjustment;
+                actualRuneCount = totalUtf16CodeUnitCount + scalarCountAdjustment;
+
+                // Surrogate pair count is number of UTF-16 code units less the number of scalars.
+
+                actualSurrogatePairCount = totalUtf16CodeUnitCount - actualRuneCount;
+            }
+
+            // Assert
+
+            Assert.Equal(expectedRetVal, actualRetVal);
+            Assert.Equal(expectedRuneCount, actualRuneCount);
+            Assert.Equal(expectedSurrogatePairCount, actualSurrogatePairCount);
+        }
+
+        // Builds the lazily-initialized delegate used by the tests. On first use it looks up the
+        // "System.Text.Unicode.Utf8Utility" type in the assembly that defines Utf8, finds its static
+        // "GetPointerToFirstInvalidByte" method, and binds it to GetPointerToFirstInvalidByteDel.
+        // Throws if either the type or the method cannot be found.
+        private static Lazy<GetPointerToFirstInvalidByteDel> CreateGetPointerToFirstInvalidByteFn()
+        {
+            return new Lazy<GetPointerToFirstInvalidByteDel>(() =>
+            {
+                // NOTE(review): the error message below names System.Private.CoreLib, whereas the lookup uses
+                // whichever assembly defines Utf8 - presumably the same assembly; confirm.
+                Type utf8UtilityType = typeof(Utf8).Assembly.GetType("System.Text.Unicode.Utf8Utility");
+
+                if (utf8UtilityType is null)
+                {
+                    throw new Exception("Couldn't find Utf8Utility type in System.Private.CoreLib.");
+                }
+
+                // Both Public and NonPublic are requested so that the lookup works regardless of the method's visibility.
+                MethodInfo methodInfo = utf8UtilityType.GetMethod("GetPointerToFirstInvalidByte", BindingFlags.Static | BindingFlags.Public | BindingFlags.NonPublic);
+
+                if (methodInfo is null)
+                {
+                    throw new Exception("Couldn't find GetPointerToFirstInvalidByte method on Utf8Utility.");
+                }
+
+                return (GetPointerToFirstInvalidByteDel)methodInfo.CreateDelegate(typeof(GetPointerToFirstInvalidByteDel));
+            });
+        }
+    }
+}
index 2718f6f..e5ae72d 100644 (file)
@@ -45,7 +45,14 @@ namespace System.Text.Tests
         public static unsafe void GetByteCount_Invalid(Encoding encoding)
         {
             // Chars is null
-            AssertExtensions.Throws<ArgumentNullException>(encoding is ASCIIEncoding ? "chars" : "s", () => encoding.GetByteCount((string)null));
+            if (PlatformDetection.IsNetCore)
+            {
+                AssertExtensions.Throws<ArgumentNullException>((encoding is ASCIIEncoding || encoding is UTF8Encoding) ? "chars" : "s", () => encoding.GetByteCount((string)null));
+            }
+            else
+            {
+                AssertExtensions.Throws<ArgumentNullException>((encoding is ASCIIEncoding) ? "chars" : "s", () => encoding.GetByteCount((string)null));
+            }
             AssertExtensions.Throws<ArgumentNullException>("chars", () => encoding.GetByteCount((char[])null));
             AssertExtensions.Throws<ArgumentNullException>("chars", () => encoding.GetByteCount((char[])null, 0, 0));
 
index 618858c..03577e7 100644 (file)
@@ -111,7 +111,7 @@ namespace System.Text.Tests
             EncodingHelpers.Decode(new UTF8Encoding(false, true), bytes, index, count, expected);
             EncodingHelpers.Decode(new UTF8Encoding(true, true), bytes, index, count, expected);
         }
-        
+
         public static IEnumerable<object[]> Decode_InvalidBytes_TestData()
         {
             yield return new object[] { new byte[] { 196, 84, 101, 115, 116, 196, 196, 196, 176, 176, 84, 101, 115, 116, 176 }, 0, 15, "\uFFFDTest\uFFFD\uFFFD\u0130\uFFFDTest\uFFFD" };
@@ -126,97 +126,217 @@ namespace System.Text.Tests
             yield return new object[] { validSurrogateBytes, 2, 2, "\uFFFD\uFFFD" };
             yield return new object[] { validSurrogateBytes, 2, 1, "\uFFFD" };
 
-            yield return new object[] { new byte[] { 0xED, 0xA0, 0x80 }, 0, 3, "\uFFFD\uFFFD" };
-            yield return new object[] { new byte[] { 0xED, 0xAF, 0xBF }, 0, 3, "\uFFFD\uFFFD" };
-            yield return new object[] { new byte[] { 0xED, 0xB0, 0x80 }, 0, 3, "\uFFFD\uFFFD" };
-            yield return new object[] { new byte[] { 0xED, 0xBF, 0xBF }, 0, 3, "\uFFFD\uFFFD" };
-
-            // Invalid surrogate pair (low/low, high/high, low/high)
-            yield return new object[] { new byte[] { 0xED, 0xA0, 0x80, 0xED, 0xAF, 0xBF }, 0, 6, "\uFFFD\uFFFD\uFFFD\uFFFD" };
-            yield return new object[] { new byte[] { 0xED, 0xB0, 0x80, 0xED, 0xB0, 0x80 }, 0, 6, "\uFFFD\uFFFD\uFFFD\uFFFD" };
-            yield return new object[] { new byte[] { 0xED, 0xA0, 0x80, 0xED, 0xA0, 0x80 }, 0, 6, "\uFFFD\uFFFD\uFFFD\uFFFD" };
-
-            // Too high scalar value in surrogates
-            yield return new object[] { new byte[] { 0xED, 0xA0, 0x80, 0xEE, 0x80, 0x80 }, 0, 6, "\uFFFD\uFFFD\uE000" };
-            yield return new object[] { new byte[] { 0xF4, 0x90, 0x80, 0x80 }, 0, 4, "\uFFFD\uFFFD\uFFFD" };
-
-            // These are examples of overlong sequences. This can cause security
-            // vulnerabilities (e.g. MS00-078) so it is important we parse these as invalid.
-            yield return new object[] { new byte[] { 0xC0 }, 0, 1, "\uFFFD" };
-            yield return new object[] { new byte[] { 0xC0, 0xAF }, 0, 2, "\uFFFD\uFFFD" };
-            yield return new object[] { new byte[] { 0xE0, 0x80, 0xBF }, 0, 3, "\uFFFD\uFFFD" };
-            yield return new object[] { new byte[] { 0xF0, 0x80, 0x80, 0xBF }, 0, 4, "\uFFFD\uFFFD\uFFFD" };
-            yield return new object[] { new byte[] { 0xF8, 0x80, 0x80, 0x80, 0xBF }, 0, 5, "\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" };
-            yield return new object[] { new byte[] { 0xFC, 0x80, 0x80, 0x80, 0x80, 0xBF }, 0, 6, "\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" };
-
-            yield return new object[] { new byte[] { 0xC0, 0xBF }, 0, 2, "\uFFFD\uFFFD" };
-            yield return new object[] { new byte[] { 0xE0, 0x9C, 0x90 }, 0, 3, "\uFFFD\uFFFD" };
-            yield return new object[] { new byte[] { 0xF0, 0x8F, 0xA4, 0x80 }, 0, 4, "\uFFFD\uFFFD\uFFFD" };
-
-            yield return new object[] { new byte[] { 0xEF, 0x41 }, 0, 2, "\uFFFD\u0041" };
-            yield return new object[] { new byte[] { 0xEF, 0xBF, 0xAE }, 0, 1, "\uFFFD" };
-            yield return new object[] { new byte[] { 0xEF, 0xBF, 0x41 }, 0, 3, "\uFFFD\u0041" };
-            yield return new object[] { new byte[] { 0xEF, 0xBF, 0x61 }, 0, 3, "\uFFFD\u0061" };
-            yield return new object[] { new byte[] { 0xEF, 0xBF, 0xEF, 0xBF, 0xAE }, 0, 5, "\uFFFD\uFFEE" };
-            yield return new object[] { new byte[] { 0xEF, 0xBF, 0xC0, 0xBF }, 0, 4, "\uFFFD\uFFFD\uFFFD" };
-
-            yield return new object[] { new byte[] { 0xF0, 0xC4, 0x80 }, 0, 3, "\uFFFD\u0100" };
-
-            yield return new object[] { new byte[] { 176 }, 0, 1, "\uFFFD" };
-            yield return new object[] { new byte[] { 196 }, 0, 1, "\uFFFD" };
-
-            yield return new object[] { new byte[] { 0xA4, 0xD0, 0x61, 0x52, 0x7C, 0x7B, 0x41, 0x6E, 0x47, 0x65, 0xA3, 0xA4 }, 0, 12, "\uFFFD\uFFFD\u0061\u0052\u007C\u007B\u0041\u006E\u0047\u0065\uFFFD\uFFFD" };
-
-            yield return new object[] { new byte[] { 0xA3 }, 0, 1, "\uFFFD" };
-            yield return new object[] { new byte[] { 0xA3, 0xA4 }, 0, 2, "\uFFFD\uFFFD" };
-            yield return new object[] { new byte[] { 0x65, 0xA3, 0xA4 }, 0, 3, "\u0065\uFFFD\uFFFD" };
-            yield return new object[] { new byte[] { 0x47, 0x65, 0xA3, 0xA4 }, 0, 4, "\u0047\u0065\uFFFD\uFFFD" };
-            yield return new object[] { new byte[] { 0xA4, 0xD0, 0x61, 0xA3, 0xA4 }, 0, 5, "\uFFFD\uFFFD\u0061\uFFFD\uFFFD" };
-            yield return new object[] { new byte[] { 0xA4, 0xD0, 0x61, 0xA3 }, 0, 4, "\uFFFD\uFFFD\u0061\uFFFD" };
-            yield return new object[] { new byte[] { 0xD0, 0x61, 0xA3 }, 0, 3, "\uFFFD\u0061\uFFFD" };
-            yield return new object[] { new byte[] { 0xA4, 0x61, 0xA3 }, 0, 3, "\uFFFD\u0061\uFFFD" };
-            yield return new object[] { new byte[] { 0xD0, 0x61, 0x52, 0xA3 }, 0, 4, "\uFFFD\u0061\u0052\uFFFD" };
-                        
-            yield return new object[] { new byte[] { 0xAA }, 0, 1, "\uFFFD" };
-            yield return new object[] { new byte[] { 0xAA, 0x41 }, 0, 2, "\uFFFD\u0041" };
-
-            yield return new object[] { new byte[] { 0xEF, 0xFF, 0xEE }, 0, 3, "\uFFFD\uFFFD\uFFFD" };
-            yield return new object[] { new byte[] { 0xEF, 0xFF, 0xAE }, 0, 3, "\uFFFD\uFFFD\uFFFD" };
-
-            yield return new object[] { new byte[] { 0x80, 0x90, 0xA0, 0xB0, 0xC1 }, 0, 5, "\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" };
-            yield return new object[] { new byte[] { 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x80, 0x90, 0xA0, 0xB0, 0xC1 }, 0, 15, "\u007F\u007F\u007F\u007F\u007F\u007F\u007F\u007F\u007F\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" };
-            yield return new object[] { new byte[] { 0x80, 0x90, 0xA0, 0xB0, 0xC1, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F }, 0, 15, "\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\u007F\u007F\u007F\u007F\u007F\u007F\u007F\u007F\u007F" };
-
-            yield return new object[] { new byte[] { 0xC2, 0x7F, 0xC2, 0xC0, 0xDF, 0x7F, 0xDF, 0xC0 }, 0, 8, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD" };
-            yield return new object[] { new byte[] { 0xC2, 0xDF }, 0, 2, "\uFFFD\uFFFD" };
-            yield return new object[] { new byte[] { 0x80, 0x80, 0xC1, 0x80, 0xC1, 0xBF }, 0, 6, "\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" };
-            yield return new object[] { new byte[] { 0xC2, 0x7F, 0xC2, 0xC0, 0x7F, 0x7F, 0x7F, 0x7F, 0xC3, 0xA1, 0xDF, 0x7F, 0xDF, 0xC0 }, 0, 14, "\uFFFD\u007F\uFFFD\uFFFD\u007F\u007F\u007F\u007F\u00E1\uFFFD\u007F\uFFFD\uFFFD" };
-
-            yield return new object[] { new byte[] { 0xE0, 0xA0, 0x7F, 0xE0, 0xA0, 0xC0, 0xE0, 0xBF, 0x7F, 0xE0, 0xBF, 0xC0 }, 0, 12, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD" };
-            yield return new object[] { new byte[] { 0xE0, 0x9F, 0x80, 0xE0, 0xC0, 0x80, 0xE0, 0x9F, 0xBF, 0xE0, 0xC0, 0xBF }, 0, 12, "\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" };
-            yield return new object[] { new byte[] { 0xE0, 0xA0, 0x7F, 0xE0, 0xA0, 0xC0, 0x7F, 0xE0, 0xBF, 0x7F, 0xC3, 0xA1, 0xE0, 0xBF, 0xC0 }, 0, 15, "\uFFFD\u007F\uFFFD\uFFFD\u007F\uFFFD\u007F\u00E1\uFFFD\uFFFD" };
-
-            yield return new object[] { new byte[] { 0xE1, 0x80, 0x7F, 0xE1, 0x80, 0xC0, 0xE1, 0xBF, 0x7F, 0xE1, 0xBF, 0xC0, 0xEC, 0x80, 0x7F, 0xEC, 0x80, 0xC0, 0xEC, 0xBF, 0x7F, 0xEC, 0xBF, 0xC0 }, 0, 24, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD" };
-            yield return new object[] { new byte[] { 0xE1, 0x7F, 0x80, 0xE1, 0xC0, 0x80, 0xE1, 0x7F, 0xBF, 0xE1, 0xC0, 0xBF, 0xEC, 0x7F, 0x80, 0xEC, 0xC0, 0x80, 0xEC, 0x7F, 0xBF, 0xEC, 0xC0, 0xBF }, 0, 24, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD" };
-
-            yield return new object[] { new byte[] { 0xED, 0x80, 0x7F, 0xED, 0x80, 0xC0, 0xED, 0x9F, 0x7F, 0xED, 0x9F, 0xC0 }, 0, 12, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD" };
-            yield return new object[] { new byte[] { 0xED, 0x7F, 0x80, 0xED, 0xA0, 0x80, 0xED, 0x7F, 0xBF, 0xED, 0xA0, 0xBF }, 0, 12, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD" };
-            yield return new object[] { new byte[] { 0xED, 0x7F, 0x80, 0xED, 0xA0, 0x80, 0xE8, 0x80, 0x80, 0xED, 0x7F, 0xBF, 0xED, 0xA0, 0xBF }, 0, 15, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u8000\uFFFD\u007F\uFFFD\uFFFD\uFFFD" };
-
-            yield return new object[] { new byte[] { 0xEE, 0x80, 0x7F, 0xEE, 0x80, 0xC0, 0xEE, 0xBF, 0x7F, 0xEE, 0xBF, 0xC0, 0xEF, 0x80, 0x7F, 0xEF, 0x80, 0xC0, 0xEF, 0xBF, 0x7F, 0xEF, 0xBF, 0xC0 }, 0, 24, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD" };
-            yield return new object[] { new byte[] { 0xEE, 0x7F, 0x80, 0xEE, 0xC0, 0x80, 0xEE, 0x7F, 0xBF, 0xEE, 0xC0, 0xBF, 0xEF, 0x7F, 0x80, 0xEF, 0xC0, 0x80, 0xEF, 0x7F, 0xBF, 0xEF, 0xC0, 0xBF }, 0, 24, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD" };
-
-            yield return new object[] { new byte[] { 0xF0, 0x90, 0x80, 0x7F, 0xF0, 0x90, 0x80, 0xC0, 0xF0, 0xBF, 0xBF, 0x7F, 0xF0, 0xBF, 0xBF, 0xC0 }, 0, 16, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD" };
-            yield return new object[] { new byte[] { 0xF0, 0x90, 0x7F, 0x80, 0xF0, 0x90, 0xC0, 0x80, 0xF0, 0x90, 0x7F, 0xBF, 0xF0, 0x90, 0xC0, 0xBF }, 0, 16, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD" };
-            yield return new object[] { new byte[] { 0xF0, 0x8F, 0x80, 0x80, 0xF0, 0xC0, 0x80, 0x80, 0xF0, 0x8F, 0xBF, 0xBF, 0xF0, 0xC0, 0xBF, 0xBF }, 0, 16, "\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" };
-
-            yield return new object[] { new byte[] { 0xF1, 0x80, 0x80, 0x7F, 0xF1, 0x80, 0x80, 0xC0, 0xF1, 0xBF, 0xBF, 0x7F, 0xF1, 0xBF, 0xBF, 0xC0, 0xF3, 0x80, 0x80, 0x7F, 0xF3, 0x80, 0x80, 0xC0, 0xF3, 0xBF, 0xBF, 0x7F, 0xF3, 0xBF, 0xBF, 0xC0 }, 0, 32, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD" };
-            yield return new object[] { new byte[] { 0xF1, 0x80, 0x7F, 0x80, 0xF1, 0x80, 0xC0, 0x80, 0xF1, 0x80, 0x7F, 0xBF, 0xF1, 0x80, 0xC0, 0xBF, 0xF3, 0x80, 0x7F, 0x80, 0xF3, 0x80, 0xC0, 0x80, 0xF3, 0x80, 0x7F, 0xBF, 0xF3, 0x80, 0xC0, 0xBF }, 0, 32, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD" };
-            yield return new object[] { new byte[] { 0xF1, 0x7F, 0x80, 0x80, 0xF1, 0xC0, 0x80, 0x80, 0xF1, 0x7F, 0xBF, 0xBF, 0xF1, 0xC0, 0xBF, 0xBF, 0xF3, 0x7F, 0x80, 0x80, 0xF3, 0xC0, 0x80, 0x80, 0xF3, 0x7F, 0xBF, 0xBF, 0xF3, 0xC0, 0xBF, 0xBF }, 0, 32, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" };
-
-            yield return new object[] { new byte[] { 0xF4, 0x80, 0x80, 0x7F, 0xF4, 0x80, 0x80, 0xC0, 0xF4, 0x8F, 0xBF, 0x7F, 0xF4, 0x8F, 0xBF, 0xC0 }, 0, 16, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD" };
-            yield return new object[] { new byte[] { 0xF4, 0x80, 0x7F, 0x80, 0xF4, 0x80, 0xC0, 0x80, 0xF4, 0x80, 0x7F, 0xBF, 0xF4, 0x80, 0xC0, 0xBF }, 0, 16, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD" };
-            yield return new object[] { new byte[] { 0xF4, 0x7F, 0x80, 0x80, 0xF4, 0x90, 0x80, 0x80, 0xF4, 0x7F, 0xBF, 0xBF, 0xF4, 0x90, 0xBF, 0xBF }, 0, 16, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" };
+            if (PlatformDetection.IsNetCore)
+            {
+                // Overlong 2-byte sequences
+                yield return new object[] { new byte[] { 0xC0, 0x80 }, 0, 2, "\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xC1, 0x80 }, 0, 2, "\uFFFD\uFFFD" };
+
+                // Incomplete 2-byte sequences
+                yield return new object[] { new byte[] { 0xC2, 0x41 }, 0, 2, "\uFFFD\u0041" };
+                yield return new object[] { new byte[] { 0xC2, 0x41 }, 0, 2, "\uFFFD\u0041" };
+
+                // Overlong 3-byte sequences
+                yield return new object[] { new byte[] { 0xE0, 0x80, 0x80 }, 0, 3, "\uFFFD\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xE0, 0x9F, 0x80 }, 0, 3, "\uFFFD\uFFFD\uFFFD" };
+
+                // Truncated 3-byte sequences
+                yield return new object[] { new byte[] { 0xE0, 0xA0, 0x41 }, 0, 3, "\uFFFD\u0041" };
+                yield return new object[] { new byte[] { 0xED, 0x9F, 0x41 }, 0, 3, "\uFFFD\u0041" };
+
+                // UTF-16 surrogate code points (invalid to be encoded in UTF-8)
+                yield return new object[] { new byte[] { 0xED, 0xA0, 0x80 }, 0, 3, "\uFFFD\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xED, 0xAF, 0xBF }, 0, 3, "\uFFFD\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xED, 0xB0, 0x80 }, 0, 3, "\uFFFD\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xED, 0xBF, 0xBF }, 0, 3, "\uFFFD\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xED, 0xA0, 0x80, 0xED, 0xAF, 0xBF }, 0, 6, "\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xED, 0xB0, 0x80, 0xED, 0xB0, 0x80 }, 0, 6, "\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xED, 0xA0, 0x80, 0xED, 0xA0, 0x80 }, 0, 6, "\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" };
+
+                // Overlong 4-byte sequences
+                yield return new object[] { new byte[] { 0xF0, 0x80, 0x80 }, 0, 3, "\uFFFD\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xF0, 0x8F, 0x80 }, 0, 3, "\uFFFD\uFFFD\uFFFD" };
+
+                // Truncated 4-byte sequences
+                yield return new object[] { new byte[] { 0xF0, 0x90, 0x41, 0x42 }, 0, 4, "\uFFFD\u0041\u0042" };
+                yield return new object[] { new byte[] { 0xF0, 0x90, 0x80, 0x42 }, 0, 4, "\uFFFD\u0042" };
+
+                // Surrogate code point followed by a valid scalar; scalar value above U+10FFFF
+                yield return new object[] { new byte[] { 0xED, 0xA0, 0x80, 0xEE, 0x80, 0x80 }, 0, 6, "\uFFFD\uFFFD\uFFFD\uE000" };
+                yield return new object[] { new byte[] { 0xF4, 0x90, 0x80, 0x80 }, 0, 4, "\uFFFD\uFFFD\uFFFD\uFFFD" };
+
+                // More examples of overlong sequences. This can cause security
+                // vulnerabilities (e.g. MS00-078) so it is important we parse these as invalid.
+                yield return new object[] { new byte[] { 0xC0 }, 0, 1, "\uFFFD" };
+                yield return new object[] { new byte[] { 0xC0, 0xAF }, 0, 2, "\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xE0, 0x80, 0xBF }, 0, 3, "\uFFFD\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xF0, 0x80, 0x80, 0xBF }, 0, 4, "\uFFFD\uFFFD\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xF8, 0x80, 0x80, 0x80, 0xBF }, 0, 5, "\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xFC, 0x80, 0x80, 0x80, 0x80, 0xBF }, 0, 6, "\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" };
+
+                yield return new object[] { new byte[] { 0xC0, 0xBF }, 0, 2, "\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xE0, 0x9C, 0x90 }, 0, 3, "\uFFFD\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xF0, 0x8F, 0xA4, 0x80 }, 0, 4, "\uFFFD\uFFFD\uFFFD\uFFFD" };
+
+                yield return new object[] { new byte[] { 0xEF, 0x41 }, 0, 2, "\uFFFD\u0041" };
+                yield return new object[] { new byte[] { 0xEF, 0xBF, 0xAE }, 0, 1, "\uFFFD" };
+                yield return new object[] { new byte[] { 0xEF, 0xBF, 0x41 }, 0, 3, "\uFFFD\u0041" };
+                yield return new object[] { new byte[] { 0xEF, 0xBF, 0x61 }, 0, 3, "\uFFFD\u0061" };
+                yield return new object[] { new byte[] { 0xEF, 0xBF, 0xEF, 0xBF, 0xAE }, 0, 5, "\uFFFD\uFFEE" };
+                yield return new object[] { new byte[] { 0xEF, 0xBF, 0xC0, 0xBF }, 0, 4, "\uFFFD\uFFFD\uFFFD" };
+
+                yield return new object[] { new byte[] { 0xF0, 0xC4, 0x80 }, 0, 3, "\uFFFD\u0100" };
+
+                yield return new object[] { new byte[] { 176 }, 0, 1, "\uFFFD" };
+                yield return new object[] { new byte[] { 196 }, 0, 1, "\uFFFD" };
+
+                yield return new object[] { new byte[] { 0xA4, 0xD0, 0x61, 0x52, 0x7C, 0x7B, 0x41, 0x6E, 0x47, 0x65, 0xA3, 0xA4 }, 0, 12, "\uFFFD\uFFFD\u0061\u0052\u007C\u007B\u0041\u006E\u0047\u0065\uFFFD\uFFFD" };
+
+                yield return new object[] { new byte[] { 0xA3 }, 0, 1, "\uFFFD" };
+                yield return new object[] { new byte[] { 0xA3, 0xA4 }, 0, 2, "\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0x65, 0xA3, 0xA4 }, 0, 3, "\u0065\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0x47, 0x65, 0xA3, 0xA4 }, 0, 4, "\u0047\u0065\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xA4, 0xD0, 0x61, 0xA3, 0xA4 }, 0, 5, "\uFFFD\uFFFD\u0061\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xA4, 0xD0, 0x61, 0xA3 }, 0, 4, "\uFFFD\uFFFD\u0061\uFFFD" };
+                yield return new object[] { new byte[] { 0xD0, 0x61, 0xA3 }, 0, 3, "\uFFFD\u0061\uFFFD" };
+                yield return new object[] { new byte[] { 0xA4, 0x61, 0xA3 }, 0, 3, "\uFFFD\u0061\uFFFD" };
+                yield return new object[] { new byte[] { 0xD0, 0x61, 0x52, 0xA3 }, 0, 4, "\uFFFD\u0061\u0052\uFFFD" };
+
+                yield return new object[] { new byte[] { 0xAA }, 0, 1, "\uFFFD" };
+                yield return new object[] { new byte[] { 0xAA, 0x41 }, 0, 2, "\uFFFD\u0041" };
+
+                yield return new object[] { new byte[] { 0xEF, 0xFF, 0xEE }, 0, 3, "\uFFFD\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xEF, 0xFF, 0xAE }, 0, 3, "\uFFFD\uFFFD\uFFFD" };
+
+                yield return new object[] { new byte[] { 0x80, 0x90, 0xA0, 0xB0, 0xC1 }, 0, 5, "\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x80, 0x90, 0xA0, 0xB0, 0xC1 }, 0, 15, "\u007F\u007F\u007F\u007F\u007F\u007F\u007F\u007F\u007F\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0x80, 0x90, 0xA0, 0xB0, 0xC1, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F }, 0, 15, "\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\u007F\u007F\u007F\u007F\u007F\u007F\u007F\u007F\u007F" };
+
+                yield return new object[] { new byte[] { 0xC2, 0x7F, 0xC2, 0xC0, 0xDF, 0x7F, 0xDF, 0xC0 }, 0, 8, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xC2, 0xDF }, 0, 2, "\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0x80, 0x80, 0xC1, 0x80, 0xC1, 0xBF }, 0, 6, "\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xC2, 0x7F, 0xC2, 0xC0, 0x7F, 0x7F, 0x7F, 0x7F, 0xC3, 0xA1, 0xDF, 0x7F, 0xDF, 0xC0 }, 0, 14, "\uFFFD\u007F\uFFFD\uFFFD\u007F\u007F\u007F\u007F\u00E1\uFFFD\u007F\uFFFD\uFFFD" };
+
+                yield return new object[] { new byte[] { 0xE0, 0xA0, 0x7F, 0xE0, 0xA0, 0xC0, 0xE0, 0xBF, 0x7F, 0xE0, 0xBF, 0xC0 }, 0, 12, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xE0, 0x9F, 0x80, 0xE0, 0xC0, 0x80, 0xE0, 0x9F, 0xBF, 0xE0, 0xC0, 0xBF }, 0, 12, "\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xE0, 0xA0, 0x7F, 0xE0, 0xA0, 0xC0, 0x7F, 0xE0, 0xBF, 0x7F, 0xC3, 0xA1, 0xE0, 0xBF, 0xC0 }, 0, 15, "\uFFFD\u007F\uFFFD\uFFFD\u007F\uFFFD\u007F\u00E1\uFFFD\uFFFD" };
+
+                yield return new object[] { new byte[] { 0xE1, 0x80, 0x7F, 0xE1, 0x80, 0xC0, 0xE1, 0xBF, 0x7F, 0xE1, 0xBF, 0xC0, 0xEC, 0x80, 0x7F, 0xEC, 0x80, 0xC0, 0xEC, 0xBF, 0x7F, 0xEC, 0xBF, 0xC0 }, 0, 24, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xE1, 0x7F, 0x80, 0xE1, 0xC0, 0x80, 0xE1, 0x7F, 0xBF, 0xE1, 0xC0, 0xBF, 0xEC, 0x7F, 0x80, 0xEC, 0xC0, 0x80, 0xEC, 0x7F, 0xBF, 0xEC, 0xC0, 0xBF }, 0, 24, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD" };
+
+                yield return new object[] { new byte[] { 0xED, 0x80, 0x7F, 0xED, 0x80, 0xC0, 0xED, 0x9F, 0x7F, 0xED, 0x9F, 0xC0 }, 0, 12, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xED, 0x7F, 0x80, 0xED, 0xA0, 0x80, 0xED, 0x7F, 0xBF, 0xED, 0xA0, 0xBF }, 0, 12, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xED, 0x7F, 0x80, 0xED, 0xA0, 0x80, 0xE8, 0x80, 0x80, 0xED, 0x7F, 0xBF, 0xED, 0xA0, 0xBF }, 0, 15, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\u8000\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD" };
+
+                yield return new object[] { new byte[] { 0xEE, 0x80, 0x7F, 0xEE, 0x80, 0xC0, 0xEE, 0xBF, 0x7F, 0xEE, 0xBF, 0xC0, 0xEF, 0x80, 0x7F, 0xEF, 0x80, 0xC0, 0xEF, 0xBF, 0x7F, 0xEF, 0xBF, 0xC0 }, 0, 24, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xEE, 0x7F, 0x80, 0xEE, 0xC0, 0x80, 0xEE, 0x7F, 0xBF, 0xEE, 0xC0, 0xBF, 0xEF, 0x7F, 0x80, 0xEF, 0xC0, 0x80, 0xEF, 0x7F, 0xBF, 0xEF, 0xC0, 0xBF }, 0, 24, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD" };
+
+                yield return new object[] { new byte[] { 0xF0, 0x90, 0x80, 0x7F, 0xF0, 0x90, 0x80, 0xC0, 0xF0, 0xBF, 0xBF, 0x7F, 0xF0, 0xBF, 0xBF, 0xC0 }, 0, 16, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xF0, 0x90, 0x7F, 0x80, 0xF0, 0x90, 0xC0, 0x80, 0xF0, 0x90, 0x7F, 0xBF, 0xF0, 0x90, 0xC0, 0xBF }, 0, 16, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xF0, 0x8F, 0x80, 0x80, 0xF0, 0xC0, 0x80, 0x80, 0xF0, 0x8F, 0xBF, 0xBF, 0xF0, 0xC0, 0xBF, 0xBF }, 0, 16, "\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" };
+
+                yield return new object[] { new byte[] { 0xF1, 0x80, 0x80, 0x7F, 0xF1, 0x80, 0x80, 0xC0, 0xF1, 0xBF, 0xBF, 0x7F, 0xF1, 0xBF, 0xBF, 0xC0, 0xF3, 0x80, 0x80, 0x7F, 0xF3, 0x80, 0x80, 0xC0, 0xF3, 0xBF, 0xBF, 0x7F, 0xF3, 0xBF, 0xBF, 0xC0 }, 0, 32, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xF1, 0x80, 0x7F, 0x80, 0xF1, 0x80, 0xC0, 0x80, 0xF1, 0x80, 0x7F, 0xBF, 0xF1, 0x80, 0xC0, 0xBF, 0xF3, 0x80, 0x7F, 0x80, 0xF3, 0x80, 0xC0, 0x80, 0xF3, 0x80, 0x7F, 0xBF, 0xF3, 0x80, 0xC0, 0xBF }, 0, 32, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xF1, 0x7F, 0x80, 0x80, 0xF1, 0xC0, 0x80, 0x80, 0xF1, 0x7F, 0xBF, 0xBF, 0xF1, 0xC0, 0xBF, 0xBF, 0xF3, 0x7F, 0x80, 0x80, 0xF3, 0xC0, 0x80, 0x80, 0xF3, 0x7F, 0xBF, 0xBF, 0xF3, 0xC0, 0xBF, 0xBF }, 0, 32, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" };
+
+                yield return new object[] { new byte[] { 0xF4, 0x80, 0x80, 0x7F, 0xF4, 0x80, 0x80, 0xC0, 0xF4, 0x8F, 0xBF, 0x7F, 0xF4, 0x8F, 0xBF, 0xC0 }, 0, 16, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xF4, 0x80, 0x7F, 0x80, 0xF4, 0x80, 0xC0, 0x80, 0xF4, 0x80, 0x7F, 0xBF, 0xF4, 0x80, 0xC0, 0xBF }, 0, 16, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xF4, 0x7F, 0x80, 0x80, 0xF4, 0x90, 0x80, 0x80, 0xF4, 0x7F, 0xBF, 0xBF, 0xF4, 0x90, 0xBF, 0xBF }, 0, 16, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" };
+            }
+            else
+            {
+                yield return new object[] { new byte[] { 0xED, 0xA0, 0x80 }, 0, 3, "\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xED, 0xAF, 0xBF }, 0, 3, "\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xED, 0xB0, 0x80 }, 0, 3, "\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xED, 0xBF, 0xBF }, 0, 3, "\uFFFD\uFFFD" };
+
+                // Invalid surrogate pairs (high/high, low/low, high/high)
+                yield return new object[] { new byte[] { 0xED, 0xA0, 0x80, 0xED, 0xAF, 0xBF }, 0, 6, "\uFFFD\uFFFD\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xED, 0xB0, 0x80, 0xED, 0xB0, 0x80 }, 0, 6, "\uFFFD\uFFFD\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xED, 0xA0, 0x80, 0xED, 0xA0, 0x80 }, 0, 6, "\uFFFD\uFFFD\uFFFD\uFFFD" };
+
+                // Surrogate code point followed by a valid scalar; scalar value above U+10FFFF
+                yield return new object[] { new byte[] { 0xED, 0xA0, 0x80, 0xEE, 0x80, 0x80 }, 0, 6, "\uFFFD\uFFFD\uE000" };
+                yield return new object[] { new byte[] { 0xF4, 0x90, 0x80, 0x80 }, 0, 4, "\uFFFD\uFFFD\uFFFD" };
+
+                // These are examples of overlong sequences. This can cause security
+                // vulnerabilities (e.g. MS00-078) so it is important we parse these as invalid.
+                yield return new object[] { new byte[] { 0xC0 }, 0, 1, "\uFFFD" };
+                yield return new object[] { new byte[] { 0xC0, 0xAF }, 0, 2, "\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xE0, 0x80, 0xBF }, 0, 3, "\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xF0, 0x80, 0x80, 0xBF }, 0, 4, "\uFFFD\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xF8, 0x80, 0x80, 0x80, 0xBF }, 0, 5, "\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xFC, 0x80, 0x80, 0x80, 0x80, 0xBF }, 0, 6, "\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" };
+
+                yield return new object[] { new byte[] { 0xC0, 0xBF }, 0, 2, "\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xE0, 0x9C, 0x90 }, 0, 3, "\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xF0, 0x8F, 0xA4, 0x80 }, 0, 4, "\uFFFD\uFFFD\uFFFD" };
+
+                yield return new object[] { new byte[] { 0xEF, 0x41 }, 0, 2, "\uFFFD\u0041" };
+                yield return new object[] { new byte[] { 0xEF, 0xBF, 0xAE }, 0, 1, "\uFFFD" };
+                yield return new object[] { new byte[] { 0xEF, 0xBF, 0x41 }, 0, 3, "\uFFFD\u0041" };
+                yield return new object[] { new byte[] { 0xEF, 0xBF, 0x61 }, 0, 3, "\uFFFD\u0061" };
+                yield return new object[] { new byte[] { 0xEF, 0xBF, 0xEF, 0xBF, 0xAE }, 0, 5, "\uFFFD\uFFEE" };
+                yield return new object[] { new byte[] { 0xEF, 0xBF, 0xC0, 0xBF }, 0, 4, "\uFFFD\uFFFD\uFFFD" };
+
+                yield return new object[] { new byte[] { 0xF0, 0xC4, 0x80 }, 0, 3, "\uFFFD\u0100" };
+
+                yield return new object[] { new byte[] { 176 }, 0, 1, "\uFFFD" };
+                yield return new object[] { new byte[] { 196 }, 0, 1, "\uFFFD" };
+
+                yield return new object[] { new byte[] { 0xA4, 0xD0, 0x61, 0x52, 0x7C, 0x7B, 0x41, 0x6E, 0x47, 0x65, 0xA3, 0xA4 }, 0, 12, "\uFFFD\uFFFD\u0061\u0052\u007C\u007B\u0041\u006E\u0047\u0065\uFFFD\uFFFD" };
+
+                yield return new object[] { new byte[] { 0xA3 }, 0, 1, "\uFFFD" };
+                yield return new object[] { new byte[] { 0xA3, 0xA4 }, 0, 2, "\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0x65, 0xA3, 0xA4 }, 0, 3, "\u0065\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0x47, 0x65, 0xA3, 0xA4 }, 0, 4, "\u0047\u0065\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xA4, 0xD0, 0x61, 0xA3, 0xA4 }, 0, 5, "\uFFFD\uFFFD\u0061\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xA4, 0xD0, 0x61, 0xA3 }, 0, 4, "\uFFFD\uFFFD\u0061\uFFFD" };
+                yield return new object[] { new byte[] { 0xD0, 0x61, 0xA3 }, 0, 3, "\uFFFD\u0061\uFFFD" };
+                yield return new object[] { new byte[] { 0xA4, 0x61, 0xA3 }, 0, 3, "\uFFFD\u0061\uFFFD" };
+                yield return new object[] { new byte[] { 0xD0, 0x61, 0x52, 0xA3 }, 0, 4, "\uFFFD\u0061\u0052\uFFFD" };
+
+                yield return new object[] { new byte[] { 0xAA }, 0, 1, "\uFFFD" };
+                yield return new object[] { new byte[] { 0xAA, 0x41 }, 0, 2, "\uFFFD\u0041" };
+
+                yield return new object[] { new byte[] { 0xEF, 0xFF, 0xEE }, 0, 3, "\uFFFD\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xEF, 0xFF, 0xAE }, 0, 3, "\uFFFD\uFFFD\uFFFD" };
+
+                yield return new object[] { new byte[] { 0x80, 0x90, 0xA0, 0xB0, 0xC1 }, 0, 5, "\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x80, 0x90, 0xA0, 0xB0, 0xC1 }, 0, 15, "\u007F\u007F\u007F\u007F\u007F\u007F\u007F\u007F\u007F\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0x80, 0x90, 0xA0, 0xB0, 0xC1, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F }, 0, 15, "\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\u007F\u007F\u007F\u007F\u007F\u007F\u007F\u007F\u007F" };
+
+                yield return new object[] { new byte[] { 0xC2, 0x7F, 0xC2, 0xC0, 0xDF, 0x7F, 0xDF, 0xC0 }, 0, 8, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xC2, 0xDF }, 0, 2, "\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0x80, 0x80, 0xC1, 0x80, 0xC1, 0xBF }, 0, 6, "\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xC2, 0x7F, 0xC2, 0xC0, 0x7F, 0x7F, 0x7F, 0x7F, 0xC3, 0xA1, 0xDF, 0x7F, 0xDF, 0xC0 }, 0, 14, "\uFFFD\u007F\uFFFD\uFFFD\u007F\u007F\u007F\u007F\u00E1\uFFFD\u007F\uFFFD\uFFFD" };
+
+                yield return new object[] { new byte[] { 0xE0, 0xA0, 0x7F, 0xE0, 0xA0, 0xC0, 0xE0, 0xBF, 0x7F, 0xE0, 0xBF, 0xC0 }, 0, 12, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xE0, 0x9F, 0x80, 0xE0, 0xC0, 0x80, 0xE0, 0x9F, 0xBF, 0xE0, 0xC0, 0xBF }, 0, 12, "\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xE0, 0xA0, 0x7F, 0xE0, 0xA0, 0xC0, 0x7F, 0xE0, 0xBF, 0x7F, 0xC3, 0xA1, 0xE0, 0xBF, 0xC0 }, 0, 15, "\uFFFD\u007F\uFFFD\uFFFD\u007F\uFFFD\u007F\u00E1\uFFFD\uFFFD" };
+
+                yield return new object[] { new byte[] { 0xE1, 0x80, 0x7F, 0xE1, 0x80, 0xC0, 0xE1, 0xBF, 0x7F, 0xE1, 0xBF, 0xC0, 0xEC, 0x80, 0x7F, 0xEC, 0x80, 0xC0, 0xEC, 0xBF, 0x7F, 0xEC, 0xBF, 0xC0 }, 0, 24, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xE1, 0x7F, 0x80, 0xE1, 0xC0, 0x80, 0xE1, 0x7F, 0xBF, 0xE1, 0xC0, 0xBF, 0xEC, 0x7F, 0x80, 0xEC, 0xC0, 0x80, 0xEC, 0x7F, 0xBF, 0xEC, 0xC0, 0xBF }, 0, 24, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD" };
+
+                yield return new object[] { new byte[] { 0xED, 0x80, 0x7F, 0xED, 0x80, 0xC0, 0xED, 0x9F, 0x7F, 0xED, 0x9F, 0xC0 }, 0, 12, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xED, 0x7F, 0x80, 0xED, 0xA0, 0x80, 0xED, 0x7F, 0xBF, 0xED, 0xA0, 0xBF }, 0, 12, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xED, 0x7F, 0x80, 0xED, 0xA0, 0x80, 0xE8, 0x80, 0x80, 0xED, 0x7F, 0xBF, 0xED, 0xA0, 0xBF }, 0, 15, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u8000\uFFFD\u007F\uFFFD\uFFFD\uFFFD" };
+
+                yield return new object[] { new byte[] { 0xEE, 0x80, 0x7F, 0xEE, 0x80, 0xC0, 0xEE, 0xBF, 0x7F, 0xEE, 0xBF, 0xC0, 0xEF, 0x80, 0x7F, 0xEF, 0x80, 0xC0, 0xEF, 0xBF, 0x7F, 0xEF, 0xBF, 0xC0 }, 0, 24, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xEE, 0x7F, 0x80, 0xEE, 0xC0, 0x80, 0xEE, 0x7F, 0xBF, 0xEE, 0xC0, 0xBF, 0xEF, 0x7F, 0x80, 0xEF, 0xC0, 0x80, 0xEF, 0x7F, 0xBF, 0xEF, 0xC0, 0xBF }, 0, 24, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD" };
+
+                yield return new object[] { new byte[] { 0xF0, 0x90, 0x80, 0x7F, 0xF0, 0x90, 0x80, 0xC0, 0xF0, 0xBF, 0xBF, 0x7F, 0xF0, 0xBF, 0xBF, 0xC0 }, 0, 16, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xF0, 0x90, 0x7F, 0x80, 0xF0, 0x90, 0xC0, 0x80, 0xF0, 0x90, 0x7F, 0xBF, 0xF0, 0x90, 0xC0, 0xBF }, 0, 16, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xF0, 0x8F, 0x80, 0x80, 0xF0, 0xC0, 0x80, 0x80, 0xF0, 0x8F, 0xBF, 0xBF, 0xF0, 0xC0, 0xBF, 0xBF }, 0, 16, "\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" };
+
+                yield return new object[] { new byte[] { 0xF1, 0x80, 0x80, 0x7F, 0xF1, 0x80, 0x80, 0xC0, 0xF1, 0xBF, 0xBF, 0x7F, 0xF1, 0xBF, 0xBF, 0xC0, 0xF3, 0x80, 0x80, 0x7F, 0xF3, 0x80, 0x80, 0xC0, 0xF3, 0xBF, 0xBF, 0x7F, 0xF3, 0xBF, 0xBF, 0xC0 }, 0, 32, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xF1, 0x80, 0x7F, 0x80, 0xF1, 0x80, 0xC0, 0x80, 0xF1, 0x80, 0x7F, 0xBF, 0xF1, 0x80, 0xC0, 0xBF, 0xF3, 0x80, 0x7F, 0x80, 0xF3, 0x80, 0xC0, 0x80, 0xF3, 0x80, 0x7F, 0xBF, 0xF3, 0x80, 0xC0, 0xBF }, 0, 32, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xF1, 0x7F, 0x80, 0x80, 0xF1, 0xC0, 0x80, 0x80, 0xF1, 0x7F, 0xBF, 0xBF, 0xF1, 0xC0, 0xBF, 0xBF, 0xF3, 0x7F, 0x80, 0x80, 0xF3, 0xC0, 0x80, 0x80, 0xF3, 0x7F, 0xBF, 0xBF, 0xF3, 0xC0, 0xBF, 0xBF }, 0, 32, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" };
+
+                yield return new object[] { new byte[] { 0xF4, 0x80, 0x80, 0x7F, 0xF4, 0x80, 0x80, 0xC0, 0xF4, 0x8F, 0xBF, 0x7F, 0xF4, 0x8F, 0xBF, 0xC0 }, 0, 16, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xF4, 0x80, 0x7F, 0x80, 0xF4, 0x80, 0xC0, 0x80, 0xF4, 0x80, 0x7F, 0xBF, 0xF4, 0x80, 0xC0, 0xBF }, 0, 16, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD" };
+                yield return new object[] { new byte[] { 0xF4, 0x7F, 0x80, 0x80, 0xF4, 0x90, 0x80, 0x80, 0xF4, 0x7F, 0xBF, 0xBF, 0xF4, 0x90, 0xBF, 0xBF }, 0, 16, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" };
+            }
         }
 
         [Theory]
@@ -229,5 +349,32 @@ namespace System.Text.Tests
             NegativeEncodingTests.Decode_Invalid(new UTF8Encoding(false, true), bytes, index, count);
             NegativeEncodingTests.Decode_Invalid(new UTF8Encoding(true, true), bytes, index, count);
         }
+
+        // Verifies that decoding ill-formed UTF-8 through a DecoderReplacementFallback substitutes
+        // the caller-supplied replacement string once for each ill-formed run in the input below.
+        // An empty replacement string therefore drops the ill-formed bytes entirely.
+        [Theory]
+        [InlineData("", "ABCDEF")]
+        [InlineData("\uFFFD", "\uFFFDAB\uFFFDCD\uFFFDEF\uFFFD")]
+        [InlineData("?", "?AB?CD?EF?")]
+        [InlineData("\uFFFD?", "\uFFFD?AB\uFFFD?CD\uFFFD?EF\uFFFD?")]
+        public void Decode_InvalidChars_WithCustomReplacementFallback(string replacementString, string expected)
+        {
+            // UTF-8 encoding whose decoder fallback swaps in the replacement string under test;
+            // the encoder side is not exercised here.
+            var replacementFallback = new DecoderReplacementFallback(replacementString);
+            Encoding encoding = Encoding.GetEncoding("utf-8", EncoderFallback.ExceptionFallback, replacementFallback);
+
+            // Four ill-formed runs, each separated by a pair of ASCII letters.
+            byte[] illFormedUtf8 =
+            {
+                0xC0,             // always an invalid byte
+                (byte)'A', (byte)'B',
+                0xF4, 0x80, 0xBF, // incomplete 4-byte sequence
+                (byte)'C', (byte)'D',
+                0xE0,             // incomplete 3-byte sequence
+                (byte)'E', (byte)'F',
+                0xC2,             // incomplete 2-byte sequence
+            };
+
+            string decoded = encoding.GetString(illFormedUtf8);
+
+            Assert.Equal(expected, decoded);
+        }
     }
 }
index e2e7bbc..b15f4d1 100644 (file)
@@ -3,6 +3,7 @@
 // See the LICENSE file in the project root for more information.
 
 using System.Collections.Generic;
+using System.Linq;
 using Xunit;
 
 namespace System.Text.Tests
@@ -248,5 +249,24 @@ namespace System.Text.Tests
                 new UTF8Encoding(encoderShouldEmitUTF8Identifier: true, throwOnInvalidBytes: true), 
                 chars, index, count);
         }
+
+        // Verifies that encoding ill-formed UTF-16 (lone surrogates) through an
+        // EncoderReplacementFallback emits the caller-supplied replacement string once per
+        // lone surrogate. An empty replacement string drops the surrogates entirely.
+        [Theory]
+        [InlineData("", "ABCDEF")]
+        [InlineData("\uFFFD", "\uFFFDAB\uFFFDCD\uFFFDEF\uFFFD")]
+        [InlineData("?", "?AB?CD?EF?")]
+        [InlineData("\uFFFD?", "\uFFFD?AB\uFFFD?CD\uFFFD?EF\uFFFD?")]
+        public void Encode_InvalidChars_WithCustomReplacementFallback(string replacementString, string expected)
+        {
+            // Four lone surrogates, each separated by a pair of ASCII letters.
+            const string illFormedUtf16 = "\uD800AB\uDC00CD\uDFFFEF\uDBFF";
+
+            // The expected text only contains ASCII and U+FFFD, so its UTF-8 form is built by hand:
+            // U+FFFD becomes EF BF BD and every other char becomes a single byte.
+            byte[] replacementCharAsUtf8 = { 0xEF, 0xBF, 0xBD };
+            byte[] expectedBytes = expected
+                .SelectMany(c => c == '\uFFFD' ? replacementCharAsUtf8 : new byte[] { (byte)c })
+                .ToArray();
+
+            // UTF-8 encoding whose encoder fallback swaps in the replacement string under test;
+            // the decoder side is not exercised here.
+            var replacementFallback = new EncoderReplacementFallback(replacementString);
+            Encoding encoding = Encoding.GetEncoding("utf-8", replacementFallback, DecoderFallback.ExceptionFallback);
+
+            byte[] actualBytes = encoding.GetBytes(illFormedUtf16);
+
+            Assert.Equal(expectedBytes, actualBytes);
+        }
     }
 }