Defense-in-depth: Web encoders should escape BOM U+FEFF (dotnet/corefx#39815)

author Levi Broderick <GrabYourPitchforks@users.noreply.github.com>

Tue, 30 Jul 2019 17:40:47 +0000 (10:40 -0700)

committer GitHub <noreply@github.com>

Tue, 30 Jul 2019 17:40:47 +0000 (10:40 -0700)
author Levi Broderick <GrabYourPitchforks@users.noreply.github.com>
Tue, 30 Jul 2019 17:40:47 +0000 (10:40 -0700)
committer GitHub <noreply@github.com>
Tue, 30 Jul 2019 17:40:47 +0000 (10:40 -0700)
diff --git a/src/libraries/System.Text.Encodings.Web/src/System/Text/Unicode/UnicodeHelpers.generated.cs b/src/libraries/System.Text.Encodings.Web/src/System/Text/Unicode/UnicodeHelpers.generated.cs

index 68c0f06..8f0fa18 100644 (file)
--- a/src/libraries/System.Text.Encodings.Web/src/System/Text/Unicode/UnicodeHelpers.generated.cs
+++ b/src/libraries/System.Text.Encodings.Web/src/System/Text/Unicode/UnicodeHelpers.generated.cs
@@ -520,7 +520,7 @@ namespace System.Text.Unicode
              0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, // U+FD00..U+FD7F
              0xFF, 0xFF, 0xFC, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x3F, // U+FD80..U+FDFF
              0xFF, 0xFF, 0xFF, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xF7, 0xFF, 0x7F, 0x0F, 0xDF, 0xFF, // U+FE00..U+FE7F
-            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x9F, // U+FE80..U+FEFF
+            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x1F, // U+FE80..U+FEFF
              0xFE, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, // U+FF00..U+FF7F
              0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x7F, 0xFC, 0xFC, 0xFC, 0x1C, 0x7F, 0x7F, 0x00, 0x3E, // U+FF80..U+FFFF
          };
diff --git a/src/libraries/System.Text.Encodings.Web/tests/UnicodeHelpersTests.cs b/src/libraries/System.Text.Encodings.Web/tests/UnicodeHelpersTests.cs

index 1113dc8..2d8e3d1 100644 (file)
--- a/src/libraries/System.Text.Encodings.Web/tests/UnicodeHelpersTests.cs
+++ b/src/libraries/System.Text.Encodings.Web/tests/UnicodeHelpersTests.cs
@@ -178,6 +178,10 @@ namespace Microsoft.Framework.WebEncoders
                  {
                      retVal[codePoint] = true; // we allow U+0020 SPACE as our only valid Zs (whitespace) char
                  }
+                else if (codePoint == 0xFEFF)
+                {
+                    retVal[codePoint] = false; // we explicitly forbid U+FEFF ZERO WIDTH NO-BREAK SPACE because it's also the byte order mark (BOM)
+                }
                  else
                  {
                      string category = splitLine[2];
@@ -186,11 +190,11 @@ namespace Microsoft.Framework.WebEncoders
                      {
                          retVal[codePoint] = true; // chars in this category are allowable
                          seenCategories.Add(category);
-                        
+
                          if (splitLine[1].EndsWith("First>"))
                          {
                              startSpanCodepoint = codePoint;
-                        } 
+                        }
                          else if (splitLine[1].EndsWith("Last>"))
                          {
                              for (uint spanCounter = startSpanCodepoint; spanCounter < codePoint; spanCounter++)
@@ -198,7 +202,7 @@ namespace Microsoft.Framework.WebEncoders
                                  retVal[spanCounter] = true; // chars in this category are allowable
                              }
                          }
-                        
+
                      }
                  }
              }
diff --git a/src/libraries/System.Text.Encodings.Web/tools/GenDefinedCharList/Program.cs b/src/libraries/System.Text.Encodings.Web/tools/GenDefinedCharList/Program.cs

index 8ce5d6f..83d379e 100644 (file)
--- a/src/libraries/System.Text.Encodings.Web/tools/GenDefinedCharList/Program.cs
+++ b/src/libraries/System.Text.Encodings.Web/tools/GenDefinedCharList/Program.cs
@@ -62,9 +62,9 @@ namespace GenDefinedCharList
                  }
  
                  // We only allow certain categories of code points.
-                // Zs (space separators) aren't included, but we allow U+0020 SPACE as a special case
+                // Check for special-cased code points before querying whether the overall category is allowed.
  
-                if (!(codepoint == (uint)' ' || IsAllowedUnicodeCategory(category)))
+                if (!(GetIsCodePointAllowedOverride(codepoint) ?? IsAllowedUnicodeCategory(category)))
                  {
                      continue;
                  }
@@ -151,6 +151,26 @@ namespace GenDefinedCharList
              File.WriteAllText(args[1], builder.ToString());
          }
  
+        private static bool? GetIsCodePointAllowedOverride(uint codepoint)
+        {
+            switch (codepoint)
+            {
+                case 0x0020:
+                    // ' ' U+0020 SPACE is allowed, even though the Zs category (space separators) is otherwise disallowed.
+                    return true;
+
+                case 0xFEFF:
+                    // U+FEFF ZERO WIDTH NO-BREAK SPACE is disallowed, even though the Cf category (format characters) is otherwise allowed.
+                    // The reason for this is that U+FEFF is also used as the byte order mark (BOM), and some clients don't handle this
+                    // code point correctly when it appears in the middle of a string. See https://www.unicode.org/faq/utf_bom.html#BOM.
+                    return false;
+
+                default:
+                    // No override specified; fall back to whether the category itself is allowed or disallowed.
+                    return null;
+            }
+        }
+
          private static bool IsAllowedUnicodeCategory(string category)
          {
              // We only allow certain classes of characters
author	Levi Broderick <GrabYourPitchforks@users.noreply.github.com>
	Tue, 30 Jul 2019 17:40:47 +0000 (10:40 -0700)
committer	GitHub <noreply@github.com>
	Tue, 30 Jul 2019 17:40:47 +0000 (10:40 -0700)
src/libraries/System.Text.Encodings.Web/src/System/Text/Unicode/UnicodeHelpers.generated.cs		patch | blob | history
src/libraries/System.Text.Encodings.Web/tests/UnicodeHelpersTests.cs		patch | blob | history
src/libraries/System.Text.Encodings.Web/tools/GenDefinedCharList/Program.cs		patch | blob | history