From 621fe7e242f18fad97aaf3dd394322fb48ac6768 Mon Sep 17 00:00:00 2001
From: Levi Broderick <GrabYourPitchforks@users.noreply.github.com>
Date: Tue, 30 Jul 2019 10:40:47 -0700
Subject: [PATCH] Defense-in-depth: Web encoders should escape BOM U+FEFF
 (dotnet/corefx#39815)

As a defense-in-depth mechanism, HtmlEncoder and related types should always encode the Unicode Byte Order Mark (U+FEFF), even if the caller passes a bitmap that lists it as an allowed code point. This helps protect against misbehaving clients that incorrectly strip the Byte Order Mark from input sequences.

Commit migrated from https://github.com/dotnet/corefx/commit/9d6729a940952dcc14dc8683218684f4da29198a
---
 .../Text/Unicode/UnicodeHelpers.generated.cs       |  2 +-
 .../tests/UnicodeHelpersTests.cs                   | 10 ++++++---
 .../tools/GenDefinedCharList/Program.cs            | 24 ++++++++++++++++++++--
 3 files changed, 30 insertions(+), 6 deletions(-)

diff --git a/src/libraries/System.Text.Encodings.Web/src/System/Text/Unicode/UnicodeHelpers.generated.cs b/src/libraries/System.Text.Encodings.Web/src/System/Text/Unicode/UnicodeHelpers.generated.cs
index 68c0f06..8f0fa18 100644
--- a/src/libraries/System.Text.Encodings.Web/src/System/Text/Unicode/UnicodeHelpers.generated.cs
+++ b/src/libraries/System.Text.Encodings.Web/src/System/Text/Unicode/UnicodeHelpers.generated.cs
@@ -520,7 +520,7 @@ namespace System.Text.Unicode
             0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, // U+FD00..U+FD7F
             0xFF, 0xFF, 0xFC, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x3F, // U+FD80..U+FDFF
             0xFF, 0xFF, 0xFF, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xF7, 0xFF, 0x7F, 0x0F, 0xDF, 0xFF, // U+FE00..U+FE7F
-            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x9F, // U+FE80..U+FEFF
+            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x1F, // U+FE80..U+FEFF
             0xFE, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, // U+FF00..U+FF7F
             0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x7F, 0xFC, 0xFC, 0xFC, 0x1C, 0x7F, 0x7F, 0x00, 0x3E, // U+FF80..U+FFFF
         };
diff --git a/src/libraries/System.Text.Encodings.Web/tests/UnicodeHelpersTests.cs b/src/libraries/System.Text.Encodings.Web/tests/UnicodeHelpersTests.cs
index 1113dc8..2d8e3d1 100644
--- a/src/libraries/System.Text.Encodings.Web/tests/UnicodeHelpersTests.cs
+++ b/src/libraries/System.Text.Encodings.Web/tests/UnicodeHelpersTests.cs
@@ -178,6 +178,10 @@ namespace Microsoft.Framework.WebEncoders
                 {
                     retVal[codePoint] = true; // we allow U+0020 SPACE as our only valid Zs (whitespace) char
                 }
+                else if (codePoint == 0xFEFF)
+                {
+                    retVal[codePoint] = false; // we explicitly forbid U+FEFF ZERO WIDTH NO-BREAK SPACE because it's also the byte order mark (BOM)
+                }
                 else
                 {
                     string category = splitLine[2];
@@ -186,11 +190,11 @@ namespace Microsoft.Framework.WebEncoders
                     {
                         retVal[codePoint] = true; // chars in this category are allowable
                         seenCategories.Add(category);
-                        
+
                         if (splitLine[1].EndsWith("First>"))
                         {
                             startSpanCodepoint = codePoint;
-                        } 
+                        }
                         else if (splitLine[1].EndsWith("Last>"))
                         {
                             for (uint spanCounter = startSpanCodepoint; spanCounter < codePoint; spanCounter++)
@@ -198,7 +202,7 @@ namespace Microsoft.Framework.WebEncoders
                                 retVal[spanCounter] = true; // chars in this category are allowable
                             }
                         }
-                        
+
                     }
                 }
             }
diff --git a/src/libraries/System.Text.Encodings.Web/tools/GenDefinedCharList/Program.cs b/src/libraries/System.Text.Encodings.Web/tools/GenDefinedCharList/Program.cs
index 8ce5d6f..83d379e 100644
--- a/src/libraries/System.Text.Encodings.Web/tools/GenDefinedCharList/Program.cs
+++ b/src/libraries/System.Text.Encodings.Web/tools/GenDefinedCharList/Program.cs
@@ -62,9 +62,9 @@ namespace GenDefinedCharList
                 }
 
                 // We only allow certain categories of code points.
-                // Zs (space separators) aren't included, but we allow U+0020 SPACE as a special case
+                // Check for special-cased code points before querying whether the overall category is allowed.
 
-                if (!(codepoint == (uint)' ' || IsAllowedUnicodeCategory(category)))
+                if (!(GetIsCodePointAllowedOverride(codepoint) ?? IsAllowedUnicodeCategory(category)))
                 {
                     continue;
                 }
@@ -151,6 +151,26 @@ namespace GenDefinedCharList
             File.WriteAllText(args[1], builder.ToString());
         }
 
+        /// <summary>Returns an explicit allow/deny override for a code point, or null to defer to its category.</summary>
+        private static bool? GetIsCodePointAllowedOverride(uint codepoint)
+        {
+            if (codepoint == 0x0020)
+            {
+                // ' ' U+0020 SPACE stays allowed, although the rest of the Zs category (space separators) is disallowed.
+                return true;
+            }
+
+            if (codepoint == 0xFEFF)
+            {
+                // U+FEFF ZERO WIDTH NO-BREAK SPACE is disallowed, although the Cf category (format characters) is otherwise allowed.
+                // It doubles as the byte order mark (BOM), and some clients mishandle it mid-string. See https://www.unicode.org/faq/utf_bom.html#BOM.
+                return false;
+            }
+
+            // No override for this code point; the caller falls back to whether the category itself is allowed.
+            return null;
+        }
+
         private static bool IsAllowedUnicodeCategory(string category)
         {
             // We only allow certain classes of characters
-- 
2.7.4