From 621fe7e242f18fad97aaf3dd394322fb48ac6768 Mon Sep 17 00:00:00 2001 From: Levi Broderick Date: Tue, 30 Jul 2019 10:40:47 -0700 Subject: [PATCH] Defense-in-depth: Web encoders should escape BOM U+FEFF (dotnet/corefx#39815) As a defense-in-depth mechanism, HtmlEncoder and related types should always encode the Unicode Byte Order Mark (U+FEFF), even if the caller passes a bitmap that lists this as an allowed code point. This helps provide protection for misbehaving clients which incorrectly strip the Byte Order Mark from input sequences. Commit migrated from https://github.com/dotnet/corefx/commit/9d6729a940952dcc14dc8683218684f4da29198a --- .../Text/Unicode/UnicodeHelpers.generated.cs | 2 +- .../tests/UnicodeHelpersTests.cs | 10 ++++++--- .../tools/GenDefinedCharList/Program.cs | 24 ++++++++++++++++++++-- 3 files changed, 30 insertions(+), 6 deletions(-) diff --git a/src/libraries/System.Text.Encodings.Web/src/System/Text/Unicode/UnicodeHelpers.generated.cs b/src/libraries/System.Text.Encodings.Web/src/System/Text/Unicode/UnicodeHelpers.generated.cs index 68c0f06..8f0fa18 100644 --- a/src/libraries/System.Text.Encodings.Web/src/System/Text/Unicode/UnicodeHelpers.generated.cs +++ b/src/libraries/System.Text.Encodings.Web/src/System/Text/Unicode/UnicodeHelpers.generated.cs @@ -520,7 +520,7 @@ namespace System.Text.Unicode 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, // U+FD00..U+FD7F 0xFF, 0xFF, 0xFC, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x3F, // U+FD80..U+FDFF 0xFF, 0xFF, 0xFF, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xF7, 0xFF, 0x7F, 0x0F, 0xDF, 0xFF, // U+FE00..U+FE7F - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x9F, // U+FE80..U+FEFF + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x1F, // U+FE80..U+FEFF 0xFE, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, // U+FF00..U+FF7F 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x7F, 0xFC, 0xFC, 0xFC, 0x1C, 0x7F, 0x7F, 0x00, 0x3E, // U+FF80..U+FFFF }; diff --git a/src/libraries/System.Text.Encodings.Web/tests/UnicodeHelpersTests.cs b/src/libraries/System.Text.Encodings.Web/tests/UnicodeHelpersTests.cs index 1113dc8..2d8e3d1 100644 --- a/src/libraries/System.Text.Encodings.Web/tests/UnicodeHelpersTests.cs +++ b/src/libraries/System.Text.Encodings.Web/tests/UnicodeHelpersTests.cs @@ -178,6 +178,10 @@ namespace Microsoft.Framework.WebEncoders { retVal[codePoint] = true; // we allow U+0020 SPACE as our only valid Zs (whitespace) char } + else if (codePoint == 0xFEFF) + { + retVal[codePoint] = false; // we explicitly forbid U+FEFF ZERO WIDTH NO-BREAK SPACE because it's also the byte order mark (BOM) + } else { string category = splitLine[2]; @@ -186,11 +190,11 @@ namespace Microsoft.Framework.WebEncoders { retVal[codePoint] = true; // chars in this category are allowable seenCategories.Add(category); - + if (splitLine[1].EndsWith("First>")) { startSpanCodepoint = codePoint; - } + } else if (splitLine[1].EndsWith("Last>")) { for (uint spanCounter = startSpanCodepoint; spanCounter < codePoint; spanCounter++) @@ -198,7 +202,7 @@ namespace Microsoft.Framework.WebEncoders retVal[spanCounter] = true; // chars in this category are allowable } } - + } } } diff --git a/src/libraries/System.Text.Encodings.Web/tools/GenDefinedCharList/Program.cs b/src/libraries/System.Text.Encodings.Web/tools/GenDefinedCharList/Program.cs index 8ce5d6f..83d379e 100644 --- a/src/libraries/System.Text.Encodings.Web/tools/GenDefinedCharList/Program.cs +++ b/src/libraries/System.Text.Encodings.Web/tools/GenDefinedCharList/Program.cs @@ -62,9 +62,9 @@ namespace GenDefinedCharList } // We only allow certain categories of code points. - // Zs (space separators) aren't included, but we allow U+0020 SPACE as a special case + // Check for special-cased code points before querying whether the overall category is allowed. - if (!(codepoint == (uint)' ' || IsAllowedUnicodeCategory(category))) + if (!(GetIsCodePointAllowedOverride(codepoint) ?? IsAllowedUnicodeCategory(category))) { continue; } @@ -151,6 +151,26 @@ namespace GenDefinedCharList File.WriteAllText(args[1], builder.ToString()); } + private static bool? GetIsCodePointAllowedOverride(uint codepoint) + { + switch (codepoint) + { + case 0x0020: + // ' ' U+0020 SPACE is allowed, even though the Zs category (space separators) is otherwise disallowed. + return true; + + case 0xFEFF: + // U+FEFF ZERO WIDTH NO-BREAK SPACE is disallowed, even though the Cf category (format characters) is otherwise allowed. + // The reason for this is that U+FEFF is also used as the byte order mark (BOM), and some clients don't handle this + // code point correctly when it appears in the middle of a string. See https://www.unicode.org/faq/utf_bom.html#BOM. + return false; + + default: + // No override specified; fall back to whether the category itself is allowed or disallowed. + return null; + } + } + private static bool IsAllowedUnicodeCategory(string category) { // We only allow certain classes of characters -- 2.7.4