From 0c8b166f0d178f5f24d7ad525e0bee85db60985b Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Thu, 6 Feb 2020 23:51:15 -0500 Subject: [PATCH] Use char.GetUnicodeCategory in RegexCompiler for single Unicode categories (#31881) --- .../Text/RegularExpressions/RegexCharClass.cs | 38 +++++++++++++++++ .../Text/RegularExpressions/RegexCompiler.cs | 20 ++++++++- .../tests/RegexCharacterSetTests.cs | 49 ++++++++++++++++++++++ 3 files changed, 106 insertions(+), 1 deletion(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs index 37e1d43..f64024f 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs @@ -744,6 +744,44 @@ namespace System.Text.RegularExpressions !IsSubtraction(set) && (set[SetStartIndex] == LastChar || set[SetStartIndex] + 1 == set[SetStartIndex + 1]); + /// <summary>Gets whether the set contains nothing other than a single UnicodeCategory (it may be negated).</summary> + /// <param name="set">The set to examine.</param> + /// <param name="category">The single category if there was one.</param> + /// <param name="negated">true if the set matches characters that are not in the category; otherwise, false.</param> + /// <returns>true if a single category could be obtained; otherwise, false.</returns>
+ public static bool TryGetSingleUnicodeCategory(string set, out UnicodeCategory category, out bool negated) + { + if (set[CategoryLengthIndex] == 1 && + set[SetLengthIndex] == 0 && + !IsSubtraction(set)) + { + short c = (short)set[SetStartIndex]; + + if (c > 0) + { + if (c != SpaceConst) + { + category = (UnicodeCategory)(c - 1); + negated = IsNegated(set); + return true; + } + } + else if (c < 0) + { + if (c != NotSpaceConst) + { + category = (UnicodeCategory)(-1 - c); + negated = !IsNegated(set); + return true; + } + } + } + + category = default; + negated = false; + return false; + } + /// Attempts to get a single range stored in the set. /// The set. /// The inclusive lower-bound of the range, if available. diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index c615dff..325187c 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -2,7 +2,6 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information.
-using System.Collections.Concurrent; using System.Collections.Generic; using System.Diagnostics; using System.Diagnostics.CodeAnalysis; @@ -49,6 +48,7 @@ namespace System.Text.RegularExpressions private static readonly MethodInfo s_charIsDigitMethod = typeof(char).GetMethod("IsDigit", new Type[] { typeof(char) })!; private static readonly MethodInfo s_charIsWhiteSpaceMethod = typeof(char).GetMethod("IsWhiteSpace", new Type[] { typeof(char) })!; + private static readonly MethodInfo s_charGetUnicodeInfo = typeof(char).GetMethod("GetUnicodeCategory", new Type[] { typeof(char) })!; private static readonly MethodInfo s_charToLowerMethod = typeof(char).GetMethod("ToLower", new Type[] { typeof(char), typeof(CultureInfo) })!; private static readonly MethodInfo s_charToLowerInvariantMethod = typeof(char).GetMethod("ToLowerInvariant", new Type[] { typeof(char) })!; private static readonly MethodInfo s_cultureInfoGetCurrentCultureMethod = typeof(CultureInfo).GetMethod("get_CurrentCulture")!; @@ -4760,6 +4760,24 @@ namespace System.Text.RegularExpressions return; } + // Next, if the character class contains nothing but a single Unicode category, we can call char.GetUnicodeCategory and + // compare against it. It has a fast-lookup path for ASCII, so is as good or better than any lookup we'd generate (plus + // we get smaller code), and it's what we'd do for the fallback (which we get to avoid generating) as part of CharInClass. + if (!invariant && RegexCharClass.TryGetSingleUnicodeCategory(charClass, out UnicodeCategory category, out bool negated)) + { + // char.GetUnicodeCategory(ch) == category + Call(s_charGetUnicodeInfo); + Ldc((int)category); + Ceq(); + if (negated) + { + Ldc(0); + Ceq(); + } + + return; + } + // All checks after this point require reading the input character multiple times, // so we store it into a temporary local.
Stloc(tempLocal); diff --git a/src/libraries/System.Text.RegularExpressions/tests/RegexCharacterSetTests.cs b/src/libraries/System.Text.RegularExpressions/tests/RegexCharacterSetTests.cs index 1b1c90b..a13484f 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/RegexCharacterSetTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/RegexCharacterSetTests.cs @@ -4,6 +4,7 @@ using System.Collections.Generic; using System.Globalization; +using System.Linq; using Xunit; using Xunit.Sdk; @@ -294,6 +295,54 @@ namespace System.Text.RegularExpressions.Tests ValidateSet($"[^{set}]", RegexOptions.None, null, included); } + [Theory] + [InlineData("Cc", UnicodeCategory.Control)] + [InlineData("Cf", UnicodeCategory.Format)] + [InlineData("Cn", UnicodeCategory.OtherNotAssigned)] + [InlineData("Co", UnicodeCategory.PrivateUse)] + [InlineData("Cs", UnicodeCategory.Surrogate)] + [InlineData("Ll", UnicodeCategory.LowercaseLetter)] + [InlineData("Lm", UnicodeCategory.ModifierLetter)] + [InlineData("Lo", UnicodeCategory.OtherLetter)] + [InlineData("Lt", UnicodeCategory.TitlecaseLetter)] + [InlineData("Lu", UnicodeCategory.UppercaseLetter)] + [InlineData("Mc", UnicodeCategory.SpacingCombiningMark)] + [InlineData("Me", UnicodeCategory.EnclosingMark)] + [InlineData("Mn", UnicodeCategory.NonSpacingMark)] + [InlineData("Nd", UnicodeCategory.DecimalDigitNumber)] + [InlineData("Nl", UnicodeCategory.LetterNumber)] + [InlineData("No", UnicodeCategory.OtherNumber)] + [InlineData("Pc", UnicodeCategory.ConnectorPunctuation)] + [InlineData("Pd", UnicodeCategory.DashPunctuation)] + [InlineData("Pe", UnicodeCategory.ClosePunctuation)] + [InlineData("Po", UnicodeCategory.OtherPunctuation)] + [InlineData("Ps", UnicodeCategory.OpenPunctuation)] + [InlineData("Pf", UnicodeCategory.FinalQuotePunctuation)] + [InlineData("Pi", UnicodeCategory.InitialQuotePunctuation)] + [InlineData("Sc", UnicodeCategory.CurrencySymbol)] + [InlineData("Sk", UnicodeCategory.ModifierSymbol)] +
[InlineData("Sm", UnicodeCategory.MathSymbol)] + [InlineData("So", UnicodeCategory.OtherSymbol)] + [InlineData("Zl", UnicodeCategory.LineSeparator)] + [InlineData("Zp", UnicodeCategory.ParagraphSeparator)] + [InlineData("Zs", UnicodeCategory.SpaceSeparator)] + public void UnicodeCategoriesInclusionsExpected(string generalCategory, UnicodeCategory unicodeCategory) + { + foreach (RegexOptions options in new[] { RegexOptions.None, RegexOptions.Compiled }) // run every case against both the interpreter and the compiled engine + { + Regex r; + char[] allChars = Enumerable.Range(0, char.MaxValue + 1).Select(i => (char)i).ToArray(); + int expectedInCategory = allChars.Count(c => char.GetUnicodeCategory(c) == unicodeCategory); + int expectedNotInCategory = allChars.Length - expectedInCategory; + + r = new Regex(@$"\p{{{generalCategory}}}", options); + Assert.Equal(expectedInCategory, r.Matches(string.Concat(allChars)).Count); + + r = new Regex(@$"\P{{{generalCategory}}}", options); + Assert.Equal(expectedNotInCategory, r.Matches(string.Concat(allChars)).Count); + } + } + private static HashSet ComputeIncludedSet(Func func) { var included = new HashSet(); -- 2.7.4