0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, // U+FD00..U+FD7F
0xFF, 0xFF, 0xFC, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x3F, // U+FD80..U+FDFF
0xFF, 0xFF, 0xFF, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xF7, 0xFF, 0x7F, 0x0F, 0xDF, 0xFF, // U+FE00..U+FE7F
- 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x9F, // U+FE80..U+FEFF
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x1F, // U+FE80..U+FEFF
0xFE, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, // U+FF00..U+FF7F
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x7F, 0xFC, 0xFC, 0xFC, 0x1C, 0x7F, 0x7F, 0x00, 0x3E, // U+FF80..U+FFFF
};
{
retVal[codePoint] = true; // we allow U+0020 SPACE as our only valid Zs (whitespace) char
}
+ else if (codePoint == 0xFEFF)
+ {
+ retVal[codePoint] = false; // we explicitly forbid U+FEFF ZERO WIDTH NO-BREAK SPACE because it's also the byte order mark (BOM)
+ }
else
{
string category = splitLine[2];
{
retVal[codePoint] = true; // chars in this category are allowable
seenCategories.Add(category);
-
+
if (splitLine[1].EndsWith("First>"))
{
startSpanCodepoint = codePoint;
- }
+ }
else if (splitLine[1].EndsWith("Last>"))
{
for (uint spanCounter = startSpanCodepoint; spanCounter < codePoint; spanCounter++)
retVal[spanCounter] = true; // chars in this category are allowable
}
}
-
+
}
}
}
}
// We only allow certain categories of code points.
- // Zs (space separators) aren't included, but we allow U+0020 SPACE as a special case
+ // Check for special-cased code points before querying whether the overall category is allowed.
- if (!(codepoint == (uint)' ' || IsAllowedUnicodeCategory(category)))
+ if (!(GetIsCodePointAllowedOverride(codepoint) ?? IsAllowedUnicodeCategory(category)))
{
continue;
}
File.WriteAllText(args[1], builder.ToString());
}
+ /// <summary>
+ /// Looks up a per-code-point exception to the category-based allow list.
+ /// </summary>
+ /// <param name="codepoint">The Unicode code point being classified.</param>
+ /// <returns>
+ /// <c>true</c> or <c>false</c> when this specific code point has a hard-coded verdict;
+ /// <c>null</c> when it has none and the caller should decide from the code point's category.
+ /// </returns>
+ private static bool? GetIsCodePointAllowedOverride(uint codepoint)
+ {
+     if (codepoint == 0x0020)
+     {
+         // U+0020 SPACE (' ') is permitted as a one-off exception: its category, Zs (space separators),
+         // is otherwise disallowed.
+         return true;
+     }
+
+     if (codepoint == 0xFEFF)
+     {
+         // U+FEFF ZERO WIDTH NO-BREAK SPACE is rejected as a one-off exception: its category, Cf (format
+         // characters), is otherwise allowed. The same code point doubles as the byte order mark (BOM),
+         // and some clients mishandle it when it shows up in the middle of a string.
+         // See https://www.unicode.org/faq/utf_bom.html#BOM.
+         return false;
+     }
+
+     // No special case for this code point; the category-level rule decides.
+     return null;
+ }
+
private static bool IsAllowedUnicodeCategory(string category)
{
// We only allow certain classes of characters