From ed921eef0bcbdf877d0f34f3bc47aa8aafcdbc81 Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Tue, 28 Mar 2023 06:22:42 -0400 Subject: [PATCH] Use bit mask in Regex's IsWordChar (#84003) --- .../gen/RegexGenerator.Emitter.cs | 33 ++++---- .../Text/RegularExpressions/RegexCharClass.cs | 91 ++++++++++------------ .../FunctionalTests/RegexGeneratorOutputTests.cs | 26 +++---- 3 files changed, 72 insertions(+), 78 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index 2f855fb..53c2278 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -299,6 +299,17 @@ namespace System.Text.RegularExpressions.Generator "[MethodImpl(MethodImplOptions.AggressiveInlining)]", "internal static bool IsWordChar(char ch)", "{", + " // Mask of Unicode categories that combine to form [\\w]", + " const int WordCategories =", + " 1 << (int)UnicodeCategory.UppercaseLetter |", + " 1 << (int)UnicodeCategory.LowercaseLetter |", + " 1 << (int)UnicodeCategory.TitlecaseLetter |", + " 1 << (int)UnicodeCategory.ModifierLetter |", + " 1 << (int)UnicodeCategory.OtherLetter |", + " 1 << (int)UnicodeCategory.NonSpacingMark |", + " 1 << (int)UnicodeCategory.DecimalDigitNumber |", + " 1 << (int)UnicodeCategory.ConnectorPunctuation;", + "", " // Bitmap for whether each character 0 through 127 is in [\\w]", " ReadOnlySpan ascii = new byte[]", " {", @@ -310,18 +321,7 @@ namespace System.Text.RegularExpressions.Generator " int chDiv8 = ch >> 3;", " return (uint)chDiv8 < (uint)ascii.Length ?", " (ascii[chDiv8] & (1 << (ch & 0x7))) != 0 :", - " CharUnicodeInfo.GetUnicodeCategory(ch) switch", - " {", - " UnicodeCategory.UppercaseLetter or", - " UnicodeCategory.LowercaseLetter or", - " UnicodeCategory.TitlecaseLetter or", - " UnicodeCategory.ModifierLetter or", - " UnicodeCategory.OtherLetter or", - " UnicodeCategory.NonSpacingMark or", - " UnicodeCategory.DecimalDigitNumber or", - " UnicodeCategory.ConnectorPunctuation => true,", - " _ => false,", - " };", + " (WordCategories & (1 << (int)CharUnicodeInfo.GetUnicodeCategory(ch))) != 0;", "}", }); } @@ -4770,11 +4770,16 @@ namespace System.Text.RegularExpressions.Generator Span categories = stackalloc UnicodeCategory[30]; // number of UnicodeCategory values (though it's unheard of to have a set with all of them) if (RegexCharClass.TryGetOnlyCategories(charClass, categories, out int numCategories, out bool negated)) { - // TODO https://github.com/dotnet/roslyn/issues/58246: Use pattern matching instead of switch once C# code gen quality improves. + int categoryMask = 0; + foreach (UnicodeCategory category in categories.Slice(0, numCategories)) + { + categoryMask |= 1 << (int)category; + } + negate ^= negated; return numCategories == 1 ? $"(char.GetUnicodeCategory({chExpr}) {(negate ? "!=" : "==")} UnicodeCategory.{categories[0]})" : - $"(char.GetUnicodeCategory({chExpr}) switch {{ {string.Join(" or ", categories.Slice(0, numCategories).ToArray().Select(c => $"UnicodeCategory.{c}"))} => {(negate ? "false" : "true")}, _ => {(negate ? "true" : "false")} }})"; + $"((0x{categoryMask:X} & (1 << (int)char.GetUnicodeCategory({chExpr}))) {(negate ? "==" : "!=")} 0)"; } // Next, if there's only 2 or 3 chars in the set (fairly common due to the sets we create for prefixes), diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs index c90f163..cc3611a 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs @@ -1148,31 +1148,25 @@ namespace System.Text.RegularExpressions // This is the same as IsBoundaryWordChar, except that IsBoundaryWordChar also // returns true for \u200c and \u200d. - // Fast lookup in our lookup table for ASCII characters. This is purely an optimization, and has the - // behavior as if we fell through to the switch below (which was actually used to produce the lookup table). - ReadOnlySpan asciiLookup = WordCharAsciiLookup; - int chDiv8 = ch >> 3; - if ((uint)chDiv8 < (uint)asciiLookup.Length) - { - return (asciiLookup[chDiv8] & (1 << (ch & 0x7))) != 0; - } - - // For non-ASCII, fall back to checking the Unicode category. - switch (CharUnicodeInfo.GetUnicodeCategory(ch)) - { - case UnicodeCategory.UppercaseLetter: - case UnicodeCategory.LowercaseLetter: - case UnicodeCategory.TitlecaseLetter: - case UnicodeCategory.ModifierLetter: - case UnicodeCategory.OtherLetter: - case UnicodeCategory.NonSpacingMark: - case UnicodeCategory.DecimalDigitNumber: - case UnicodeCategory.ConnectorPunctuation: - return true; - - default: - return false; - } + // Mask of Unicode categories that combine to form [\\w]" + const int WordCategories = + 1 << (int)UnicodeCategory.UppercaseLetter | + 1 << (int)UnicodeCategory.LowercaseLetter | + 1 << (int)UnicodeCategory.TitlecaseLetter | + 1 << (int)UnicodeCategory.ModifierLetter | + 1 << (int)UnicodeCategory.OtherLetter | + 1 << (int)UnicodeCategory.NonSpacingMark | + 1 << (int)UnicodeCategory.DecimalDigitNumber | + 1 << (int)UnicodeCategory.ConnectorPunctuation; + + // Bitmap for whether each character 0 through 127 is in [\\w]", + ReadOnlySpan ascii = WordCharAsciiLookup; + + // If the char is ASCII, look it up in the bitmap. Otherwise, query its Unicode category.", + int chDiv8 = ch >> 3; + return (uint)chDiv8 < (uint)ascii.Length ? + (ascii[chDiv8] & (1 << (ch & 0x7))) != 0 : + (WordCategories & (1 << (int)CharUnicodeInfo.GetUnicodeCategory(ch))) != 0; } /// Determines whether a character is considered a word character for the purposes of testing a word character boundary. @@ -1182,33 +1176,28 @@ namespace System.Text.RegularExpressions // RL 1.4 Simple Word Boundaries The class of includes all Alphabetic // values from the Unicode character database, from UnicodeData.txt [UData], plus the U+200C // ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER. - - // Fast lookup in our lookup table for ASCII characters. This is purely an optimization, and has the - // behavior as if we fell through to the switch below (which was actually used to produce the lookup table). - ReadOnlySpan asciiLookup = WordCharAsciiLookup; + const char ZeroWidthNonJoiner = '\u200C', ZeroWidthJoiner = '\u200D'; + + // Mask of Unicode categories that combine to form [\\w]" + const int WordCategories = + 1 << (int)UnicodeCategory.UppercaseLetter | + 1 << (int)UnicodeCategory.LowercaseLetter | + 1 << (int)UnicodeCategory.TitlecaseLetter | + 1 << (int)UnicodeCategory.ModifierLetter | + 1 << (int)UnicodeCategory.OtherLetter | + 1 << (int)UnicodeCategory.NonSpacingMark | + 1 << (int)UnicodeCategory.DecimalDigitNumber | + 1 << (int)UnicodeCategory.ConnectorPunctuation; + + // Bitmap for whether each character 0 through 127 is in [\\w]", + ReadOnlySpan ascii = WordCharAsciiLookup; + + // If the char is ASCII, look it up in the bitmap. Otherwise, query its Unicode category.", int chDiv8 = ch >> 3; - if ((uint)chDiv8 < (uint)asciiLookup.Length) - { - return (asciiLookup[chDiv8] & (1 << (ch & 0x7))) != 0; - } - - // For non-ASCII, fall back to checking the Unicode category. - switch (CharUnicodeInfo.GetUnicodeCategory(ch)) - { - case UnicodeCategory.UppercaseLetter: - case UnicodeCategory.LowercaseLetter: - case UnicodeCategory.TitlecaseLetter: - case UnicodeCategory.ModifierLetter: - case UnicodeCategory.OtherLetter: - case UnicodeCategory.NonSpacingMark: - case UnicodeCategory.DecimalDigitNumber: - case UnicodeCategory.ConnectorPunctuation: - return true; - - default: - const char ZeroWidthNonJoiner = '\u200C', ZeroWidthJoiner = '\u200D'; - return ch == ZeroWidthJoiner | ch == ZeroWidthNonJoiner; - } + return (uint)chDiv8 < (uint)ascii.Length ? + (ascii[chDiv8] & (1 << (ch & 0x7))) != 0 : + ((WordCategories & (1 << (int)CharUnicodeInfo.GetUnicodeCategory(ch))) != 0 || + (ch == ZeroWidthJoiner | ch == ZeroWidthNonJoiner)); } /// Determines whether the 'a' and 'b' values differ by only a single bit, setting that bit in 'mask'. diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/RegexGeneratorOutputTests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/RegexGeneratorOutputTests.cs index 4a770a3..34e5e7f 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/RegexGeneratorOutputTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/RegexGeneratorOutputTests.cs @@ -354,29 +354,29 @@ namespace System.Text.RegularExpressions.Tests [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static bool IsWordChar(char ch) { + // Mask of Unicode categories that combine to form [\w] + const int WordCategories = + 1 << (int)UnicodeCategory.UppercaseLetter | + 1 << (int)UnicodeCategory.LowercaseLetter | + 1 << (int)UnicodeCategory.TitlecaseLetter | + 1 << (int)UnicodeCategory.ModifierLetter | + 1 << (int)UnicodeCategory.OtherLetter | + 1 << (int)UnicodeCategory.NonSpacingMark | + 1 << (int)UnicodeCategory.DecimalDigitNumber | + 1 << (int)UnicodeCategory.ConnectorPunctuation; + // Bitmap for whether each character 0 through 127 is in [\w] ReadOnlySpan ascii = new byte[] { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x03, 0xFE, 0xFF, 0xFF, 0x87, 0xFE, 0xFF, 0xFF, 0x07 }; - + // If the char is ASCII, look it up in the bitmap. Otherwise, query its Unicode category. int chDiv8 = ch >> 3; return (uint)chDiv8 < (uint)ascii.Length ? (ascii[chDiv8] & (1 << (ch & 0x7))) != 0 : - CharUnicodeInfo.GetUnicodeCategory(ch) switch - { - UnicodeCategory.UppercaseLetter or - UnicodeCategory.LowercaseLetter or - UnicodeCategory.TitlecaseLetter or - UnicodeCategory.ModifierLetter or - UnicodeCategory.OtherLetter or - UnicodeCategory.NonSpacingMark or - UnicodeCategory.DecimalDigitNumber or - UnicodeCategory.ConnectorPunctuation => true, - _ => false, - }; + (WordCategories & (1 << (int)CharUnicodeInfo.GetUnicodeCategory(ch))) != 0; } /// Pushes 2 values onto the backtracking stack. -- 2.7.4