Fix HtmlEncode handling of surrogate pairs (dotnet/corefx#41576)
author Stephen Toub <stoub@microsoft.com>
Sun, 6 Oct 2019 02:33:18 +0000 (22:33 -0400)
committer GitHub <noreply@github.com>
Sun, 6 Oct 2019 02:33:18 +0000 (22:33 -0400)
Due to a regression that came as part of changing the code from using pointers to using spans, when the surrogate pair isn't at the beginning of the input, it's incorrectly encoded.

Commit migrated from https://github.com/dotnet/corefx/commit/6de42378223c6cb138d28f812c440092718c7a19

src/libraries/System.Runtime.Extensions/src/System/Net/WebUtility.cs
src/libraries/System.Runtime.Extensions/tests/System/Net/WebUtility.cs

index 998f142..b072ed8 100644 (file)
@@ -639,17 +639,17 @@ namespace System.Net
         private static int GetNextUnicodeScalarValueFromUtf16Surrogate(ReadOnlySpan<char> input, ref int index)
         {
             // invariants
-            Debug.Assert(input.Length >= 1);
-            Debug.Assert(char.IsSurrogate(input[0]));
+            Debug.Assert(input.Length - index >= 1);
+            Debug.Assert(char.IsSurrogate(input[index]));
 
-            if (input.Length <= 1)
+            if (input.Length - index <= 1)
             {
                 // not enough characters remaining to resurrect the original scalar value
                 return UnicodeReplacementChar;
             }
 
-            char leadingSurrogate = input[0];
-            char trailingSurrogate = input[1];
+            char leadingSurrogate = input[index];
+            char trailingSurrogate = input[index + 1];
 
             if (!char.IsSurrogatePair(leadingSurrogate, trailingSurrogate))
             {
index ffba432..93c082a 100644 (file)
@@ -83,6 +83,8 @@ namespace System.Net.Tests
             yield return new object[] { char.ConvertFromUtf32(144308), "&#144308;" };
             yield return new object[] { "\uD800\uDC00", "&#65536;" };
             yield return new object[] { "a\uD800\uDC00b", "a&#65536;b" };
+            yield return new object[] { "\uD83D\uDE01\uD83D\uDE02\uD83D\uDE03", "&#128513;&#128514;&#128515;" };
+            yield return new object[] { "a\uD83D\uDE01\uD83D\uDE02\uD83D\uDE03b", "a&#128513;&#128514;&#128515;b" };
 
             // High BMP non-chars
             yield return new object[] { "\uFFFD", "\uFFFD" };