From ed921eef0bcbdf877d0f34f3bc47aa8aafcdbc81 Mon Sep 17 00:00:00 2001
From: Stephen Toub <stoub@microsoft.com>
Date: Tue, 28 Mar 2023 06:22:42 -0400
Subject: [PATCH] Use bit mask in Regex's IsWordChar (#84003)

---
 .../gen/RegexGenerator.Emitter.cs                  | 33 ++++----
 .../Text/RegularExpressions/RegexCharClass.cs      | 91 ++++++++++------------
 .../FunctionalTests/RegexGeneratorOutputTests.cs   | 26 +++----
 3 files changed, 72 insertions(+), 78 deletions(-)
diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs
index 2f855fb..53c2278 100644
--- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs
+++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs
@@ -299,6 +299,17 @@ namespace System.Text.RegularExpressions.Generator
                     "[MethodImpl(MethodImplOptions.AggressiveInlining)]",
                     "internal static bool IsWordChar(char ch)",
                     "{",
+                    "    // Mask of Unicode categories that combine to form [\\w]",
+                    "    const int WordCategories =",
+                    "        1 << (int)UnicodeCategory.UppercaseLetter |",
+                    "        1 << (int)UnicodeCategory.LowercaseLetter |",
+                    "        1 << (int)UnicodeCategory.TitlecaseLetter |",
+                    "        1 << (int)UnicodeCategory.ModifierLetter |",
+                    "        1 << (int)UnicodeCategory.OtherLetter |",
+                    "        1 << (int)UnicodeCategory.NonSpacingMark |",
+                    "        1 << (int)UnicodeCategory.DecimalDigitNumber |",
+                    "        1 << (int)UnicodeCategory.ConnectorPunctuation;",
+                    "",
                     "    // Bitmap for whether each character 0 through 127 is in [\\w]",
                     "    ReadOnlySpan<byte> ascii = new byte[]",
                     "    {",
@@ -310,18 +321,7 @@ namespace System.Text.RegularExpressions.Generator
                     "    int chDiv8 = ch >> 3;",
                     "    return (uint)chDiv8 < (uint)ascii.Length ?",
                     "        (ascii[chDiv8] & (1 << (ch & 0x7))) != 0 :",
-                    "        CharUnicodeInfo.GetUnicodeCategory(ch) switch",
-                    "        {",
-                    "            UnicodeCategory.UppercaseLetter or",
-                    "            UnicodeCategory.LowercaseLetter or",
-                    "            UnicodeCategory.TitlecaseLetter or",
-                    "            UnicodeCategory.ModifierLetter or",
-                    "            UnicodeCategory.OtherLetter or",
-                    "            UnicodeCategory.NonSpacingMark or",
-                    "            UnicodeCategory.DecimalDigitNumber or",
-                    "            UnicodeCategory.ConnectorPunctuation => true,",
-                    "            _ => false,",
-                    "        };",
+                    "        (WordCategories & (1 << (int)CharUnicodeInfo.GetUnicodeCategory(ch))) != 0;",
                     "}",
                 });
             }
@@ -4770,11 +4770,16 @@ namespace System.Text.RegularExpressions.Generator
             Span<UnicodeCategory> categories = stackalloc UnicodeCategory[30]; // number of UnicodeCategory values (though it's unheard of to have a set with all of them)
             if (RegexCharClass.TryGetOnlyCategories(charClass, categories, out int numCategories, out bool negated))
             {
-                // TODO https://github.com/dotnet/roslyn/issues/58246: Use pattern matching instead of switch once C# code gen quality improves.
+                int categoryMask = 0;
+                foreach (UnicodeCategory category in categories.Slice(0, numCategories))
+                {
+                    categoryMask |= 1 << (int)category;
+                }
+
                 negate ^= negated;
                 return numCategories == 1 ?
                     $"(char.GetUnicodeCategory({chExpr}) {(negate ? "!=" : "==")} UnicodeCategory.{categories[0]})" :
-                    $"(char.GetUnicodeCategory({chExpr}) switch {{ {string.Join(" or ", categories.Slice(0, numCategories).ToArray().Select(c => $"UnicodeCategory.{c}"))} => {(negate ? "false" : "true")}, _ => {(negate ? "true" : "false")} }})";
+                    $"((0x{categoryMask:X} & (1 << (int)char.GetUnicodeCategory({chExpr}))) {(negate ? "==" : "!=")} 0)";
             }
 
             // Next, if there's only 2 or 3 chars in the set (fairly common due to the sets we create for prefixes),
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
index c90f163..cc3611a 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
@@ -1148,31 +1148,25 @@ namespace System.Text.RegularExpressions
             // This is the same as IsBoundaryWordChar, except that IsBoundaryWordChar also
             // returns true for \u200c and \u200d.
 
-            // Fast lookup in our lookup table for ASCII characters.  This is purely an optimization, and has the
-            // behavior as if we fell through to the switch below (which was actually used to produce the lookup table).
-            ReadOnlySpan<byte> asciiLookup = WordCharAsciiLookup;
-            int chDiv8 = ch >> 3;
-            if ((uint)chDiv8 < (uint)asciiLookup.Length)
-            {
-                return (asciiLookup[chDiv8] & (1 << (ch & 0x7))) != 0;
-            }
-
-            // For non-ASCII, fall back to checking the Unicode category.
-            switch (CharUnicodeInfo.GetUnicodeCategory(ch))
-            {
-                case UnicodeCategory.UppercaseLetter:
-                case UnicodeCategory.LowercaseLetter:
-                case UnicodeCategory.TitlecaseLetter:
-                case UnicodeCategory.ModifierLetter:
-                case UnicodeCategory.OtherLetter:
-                case UnicodeCategory.NonSpacingMark:
-                case UnicodeCategory.DecimalDigitNumber:
-                case UnicodeCategory.ConnectorPunctuation:
-                    return true;
-
-                default:
-                    return false;
-            }
+           // Mask of Unicode categories that combine to form [\\w]"
+           const int WordCategories =
+               1 << (int)UnicodeCategory.UppercaseLetter |
+               1 << (int)UnicodeCategory.LowercaseLetter |
+               1 << (int)UnicodeCategory.TitlecaseLetter |
+               1 << (int)UnicodeCategory.ModifierLetter |
+               1 << (int)UnicodeCategory.OtherLetter |
+               1 << (int)UnicodeCategory.NonSpacingMark |
+               1 << (int)UnicodeCategory.DecimalDigitNumber |
+               1 << (int)UnicodeCategory.ConnectorPunctuation;
+
+           // Bitmap for whether each character 0 through 127 is in [\\w]",
+           ReadOnlySpan<byte> ascii = WordCharAsciiLookup;
+
+           // If the char is ASCII, look it up in the bitmap. Otherwise, query its Unicode category.",
+           int chDiv8 = ch >> 3;
+           return (uint)chDiv8 < (uint)ascii.Length ?
+               (ascii[chDiv8] & (1 << (ch & 0x7))) != 0 :
+               (WordCategories & (1 << (int)CharUnicodeInfo.GetUnicodeCategory(ch))) != 0;
         }
 
         /// <summary>Determines whether a character is considered a word character for the purposes of testing a word character boundary.</summary>
@@ -1182,33 +1176,28 @@ namespace System.Text.RegularExpressions
             // RL 1.4 Simple Word Boundaries  The class of <word_character> includes all Alphabetic
             // values from the Unicode character database, from UnicodeData.txt [UData], plus the U+200C
             // ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER.
-
-            // Fast lookup in our lookup table for ASCII characters.  This is purely an optimization, and has the
-            // behavior as if we fell through to the switch below (which was actually used to produce the lookup table).
-            ReadOnlySpan<byte> asciiLookup = WordCharAsciiLookup;
+            const char ZeroWidthNonJoiner = '\u200C', ZeroWidthJoiner = '\u200D';
+
+            // Mask of Unicode categories that combine to form [\\w]"
+            const int WordCategories =
+                1 << (int)UnicodeCategory.UppercaseLetter |
+                1 << (int)UnicodeCategory.LowercaseLetter |
+                1 << (int)UnicodeCategory.TitlecaseLetter |
+                1 << (int)UnicodeCategory.ModifierLetter |
+                1 << (int)UnicodeCategory.OtherLetter |
+                1 << (int)UnicodeCategory.NonSpacingMark |
+                1 << (int)UnicodeCategory.DecimalDigitNumber |
+                1 << (int)UnicodeCategory.ConnectorPunctuation;
+
+            // Bitmap for whether each character 0 through 127 is in [\\w]",
+            ReadOnlySpan<byte> ascii = WordCharAsciiLookup;
+
+            // If the char is ASCII, look it up in the bitmap. Otherwise, query its Unicode category.",
             int chDiv8 = ch >> 3;
-            if ((uint)chDiv8 < (uint)asciiLookup.Length)
-            {
-                return (asciiLookup[chDiv8] & (1 << (ch & 0x7))) != 0;
-            }
-
-            // For non-ASCII, fall back to checking the Unicode category.
-            switch (CharUnicodeInfo.GetUnicodeCategory(ch))
-            {
-                case UnicodeCategory.UppercaseLetter:
-                case UnicodeCategory.LowercaseLetter:
-                case UnicodeCategory.TitlecaseLetter:
-                case UnicodeCategory.ModifierLetter:
-                case UnicodeCategory.OtherLetter:
-                case UnicodeCategory.NonSpacingMark:
-                case UnicodeCategory.DecimalDigitNumber:
-                case UnicodeCategory.ConnectorPunctuation:
-                    return true;
-
-                default:
-                    const char ZeroWidthNonJoiner = '\u200C', ZeroWidthJoiner = '\u200D';
-                    return ch == ZeroWidthJoiner | ch == ZeroWidthNonJoiner;
-            }
+            return (uint)chDiv8 < (uint)ascii.Length ?
+                (ascii[chDiv8] & (1 << (ch & 0x7))) != 0 :
+                ((WordCategories & (1 << (int)CharUnicodeInfo.GetUnicodeCategory(ch))) != 0 ||
+                 (ch == ZeroWidthJoiner | ch == ZeroWidthNonJoiner));
         }
 
         /// <summary>Determines whether the 'a' and 'b' values differ by only a single bit, setting that bit in 'mask'.</summary>
diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/RegexGeneratorOutputTests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/RegexGeneratorOutputTests.cs
index 4a770a3..34e5e7f 100644
--- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/RegexGeneratorOutputTests.cs
+++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/RegexGeneratorOutputTests.cs
@@ -354,29 +354,29 @@ namespace System.Text.RegularExpressions.Tests
                         [MethodImpl(MethodImplOptions.AggressiveInlining)]
                         internal static bool IsWordChar(char ch)
                         {
+                            // Mask of Unicode categories that combine to form [\w]
+                            const int WordCategories =
+                                1 << (int)UnicodeCategory.UppercaseLetter |
+                                1 << (int)UnicodeCategory.LowercaseLetter |
+                                1 << (int)UnicodeCategory.TitlecaseLetter |
+                                1 << (int)UnicodeCategory.ModifierLetter |
+                                1 << (int)UnicodeCategory.OtherLetter |
+                                1 << (int)UnicodeCategory.NonSpacingMark |
+                                1 << (int)UnicodeCategory.DecimalDigitNumber |
+                                1 << (int)UnicodeCategory.ConnectorPunctuation;
+                            
                             // Bitmap for whether each character 0 through 127 is in [\w]
                             ReadOnlySpan<byte> ascii = new byte[]
                             {
                                 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x03,
                                 0xFE, 0xFF, 0xFF, 0x87, 0xFE, 0xFF, 0xFF, 0x07
                             };
-
+                            
                             // If the char is ASCII, look it up in the bitmap. Otherwise, query its Unicode category.
                             int chDiv8 = ch >> 3;
                             return (uint)chDiv8 < (uint)ascii.Length ?
                                 (ascii[chDiv8] & (1 << (ch & 0x7))) != 0 :
-                                CharUnicodeInfo.GetUnicodeCategory(ch) switch
-                                {
-                                    UnicodeCategory.UppercaseLetter or
-                                    UnicodeCategory.LowercaseLetter or
-                                    UnicodeCategory.TitlecaseLetter or
-                                    UnicodeCategory.ModifierLetter or
-                                    UnicodeCategory.OtherLetter or
-                                    UnicodeCategory.NonSpacingMark or
-                                    UnicodeCategory.DecimalDigitNumber or
-                                    UnicodeCategory.ConnectorPunctuation => true,
-                                    _ => false,
-                                };
+                                (WordCategories & (1 << (int)CharUnicodeInfo.GetUnicodeCategory(ch))) != 0;
                         }
 
                         /// <summary>Pushes 2 values onto the backtracking stack.</summary>
-- 
2.7.4