Use char.GetUnicodeCategory in RegexCompiler for single Unicode categories (#31881)
authorStephen Toub <stoub@microsoft.com>
Fri, 7 Feb 2020 04:51:15 +0000 (23:51 -0500)
committerGitHub <noreply@github.com>
Fri, 7 Feb 2020 04:51:15 +0000 (23:51 -0500)
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
src/libraries/System.Text.RegularExpressions/tests/RegexCharacterSetTests.cs

index 37e1d43..f64024f 100644 (file)
@@ -744,6 +744,44 @@ namespace System.Text.RegularExpressions
             !IsSubtraction(set) &&
             (set[SetStartIndex] == LastChar || set[SetStartIndex] + 1 == set[SetStartIndex + 1]);
 
+        /// <summary>Determines whether the set consists solely of one UnicodeCategory entry (possibly negated).</summary>
+        /// <param name="set">The encoded character class to inspect.</param>
+        /// <param name="category">On success, the single category; otherwise, the default value.</param>
+        /// <param name="negated">On success, true if the set matches characters outside the category; otherwise, false.</param>
+        /// <returns>true if exactly one category could be extracted; otherwise, false.</returns>
+        public static bool TryGetSingleUnicodeCategory(string set, out UnicodeCategory category, out bool negated)
+        {
+            // Only sets with exactly one category entry, no explicit ranges, and no subtraction qualify.
+            bool isCandidate =
+                set[CategoryLengthIndex] == 1 &&
+                set[SetLengthIndex] == 0 &&
+                !IsSubtraction(set);
+
+            if (isCandidate)
+            {
+                short encoded = (short)set[SetStartIndex];
+                // A positive entry stores (category + 1); the SpaceConst entry is not translated here.
+                if (encoded > 0 && encoded != SpaceConst)
+                {
+                    category = (UnicodeCategory)(encoded - 1);
+                    negated = IsNegated(set);
+                    return true;
+                }
+
+                // A negative entry stores -(category + 1) and flips the set's own negation flag.
+                if (encoded < 0 && encoded != NotSpaceConst)
+                {
+                    category = (UnicodeCategory)(-1 - encoded);
+                    negated = !IsNegated(set);
+                    return true;
+                }
+            }
+
+            category = default;
+            negated = false;
+            return false;
+        }
+
         /// <summary>Attempts to get a single range stored in the set.</summary>
         /// <param name="set">The set.</param>
         /// <param name="lowInclusive">The inclusive lower-bound of the range, if available.</param>
index c615dff..325187c 100644 (file)
@@ -2,7 +2,6 @@
 // The .NET Foundation licenses this file to you under the MIT license.
 // See the LICENSE file in the project root for more information.
 
-using System.Collections.Concurrent;
 using System.Collections.Generic;
 using System.Diagnostics;
 using System.Diagnostics.CodeAnalysis;
@@ -49,6 +48,7 @@ namespace System.Text.RegularExpressions
 
         private static readonly MethodInfo s_charIsDigitMethod = typeof(char).GetMethod("IsDigit", new Type[] { typeof(char) })!;
         private static readonly MethodInfo s_charIsWhiteSpaceMethod = typeof(char).GetMethod("IsWhiteSpace", new Type[] { typeof(char) })!;
+        private static readonly MethodInfo s_charGetUnicodeInfo = typeof(char).GetMethod("GetUnicodeCategory", new Type[] { typeof(char) })!;
         private static readonly MethodInfo s_charToLowerMethod = typeof(char).GetMethod("ToLower", new Type[] { typeof(char), typeof(CultureInfo) })!;
         private static readonly MethodInfo s_charToLowerInvariantMethod = typeof(char).GetMethod("ToLowerInvariant", new Type[] { typeof(char) })!;
         private static readonly MethodInfo s_cultureInfoGetCurrentCultureMethod = typeof(CultureInfo).GetMethod("get_CurrentCulture")!;
@@ -4760,6 +4760,24 @@ namespace System.Text.RegularExpressions
                 return;
             }
 
+            // Next, if the character class contains nothing but a single Unicode category, we can call char.GetUnicodeCategory and
+            // compare against it.  It has a fast-lookup path for ASCII, so is as good or better than any lookup we'd generate (plus
+            // we get smaller code), and it's what we'd do for the fallback (which we get to avoid generating) as part of CharInClass.
+            if (!invariant && RegexCharClass.TryGetSingleUnicodeCategory(charClass, out UnicodeCategory category, out bool negated))
+            {
+                // char.GetUnicodeCategory(ch) == category
+                Call(s_charGetUnicodeInfo);
+                Ldc((int)category);
+                Ceq();
+                if (negated)
+                {
+                    Ldc(0);
+                    Ceq();
+                }
+
+                return;
+            }
+
             // All checks after this point require reading the input character multiple times,
             // so we store it into a temporary local.
             Stloc(tempLocal);
index 1b1c90b..a13484f 100644 (file)
@@ -4,6 +4,7 @@
 
 using System.Collections.Generic;
 using System.Globalization;
+using System.Linq;
 using Xunit;
 using Xunit.Sdk;
 
@@ -294,6 +295,54 @@ namespace System.Text.RegularExpressions.Tests
             ValidateSet($"[^{set}]", RegexOptions.None, null, included);
         }
 
+        /// <summary>
+        /// Verifies that \p{X} matches exactly the chars whose char.GetUnicodeCategory result equals the expected
+        /// category, and that \P{X} matches all the others, under each RegexOptions value being tested.
+        /// </summary>
+        /// <param name="generalCategory">The two-letter general category name used inside \p{...} / \P{...}.</param>
+        /// <param name="unicodeCategory">The UnicodeCategory that the name is expected to correspond to.</param>
+        [Theory]
+        [InlineData("Cc", UnicodeCategory.Control)]
+        [InlineData("Cf", UnicodeCategory.Format)]
+        [InlineData("Cn", UnicodeCategory.OtherNotAssigned)]
+        [InlineData("Co", UnicodeCategory.PrivateUse)]
+        [InlineData("Cs", UnicodeCategory.Surrogate)]
+        [InlineData("Ll", UnicodeCategory.LowercaseLetter)]
+        [InlineData("Lm", UnicodeCategory.ModifierLetter)]
+        [InlineData("Lo", UnicodeCategory.OtherLetter)]
+        [InlineData("Lt", UnicodeCategory.TitlecaseLetter)]
+        [InlineData("Lu", UnicodeCategory.UppercaseLetter)]
+        [InlineData("Mc", UnicodeCategory.SpacingCombiningMark)]
+        [InlineData("Me", UnicodeCategory.EnclosingMark)]
+        [InlineData("Mn", UnicodeCategory.NonSpacingMark)]
+        [InlineData("Nd", UnicodeCategory.DecimalDigitNumber)]
+        [InlineData("Nl", UnicodeCategory.LetterNumber)]
+        [InlineData("No", UnicodeCategory.OtherNumber)]
+        [InlineData("Pc", UnicodeCategory.ConnectorPunctuation)]
+        [InlineData("Pd", UnicodeCategory.DashPunctuation)]
+        [InlineData("Pe", UnicodeCategory.ClosePunctuation)]
+        [InlineData("Po", UnicodeCategory.OtherPunctuation)]
+        [InlineData("Ps", UnicodeCategory.OpenPunctuation)]
+        [InlineData("Pf", UnicodeCategory.FinalQuotePunctuation)]
+        [InlineData("Pi", UnicodeCategory.InitialQuotePunctuation)]
+        [InlineData("Sc", UnicodeCategory.CurrencySymbol)]
+        [InlineData("Sk", UnicodeCategory.ModifierSymbol)]
+        [InlineData("Sm", UnicodeCategory.MathSymbol)]
+        [InlineData("So", UnicodeCategory.OtherSymbol)]
+        [InlineData("Zl", UnicodeCategory.LineSeparator)]
+        [InlineData("Zp", UnicodeCategory.ParagraphSeparator)]
+        [InlineData("Zs", UnicodeCategory.SpaceSeparator)]
+        public void UnicodeCategoriesInclusionsExpected(string generalCategory, UnicodeCategory unicodeCategory)
+        {
+            // The input and the expected counts do not depend on the options, so compute them once.
+            char[] allChars = Enumerable.Range(0, char.MaxValue + 1).Select(i => (char)i).ToArray();
+            string input = string.Concat(allChars);
+            int expectedInCategory = allChars.Count(c => char.GetUnicodeCategory(c) == unicodeCategory);
+            int expectedNotInCategory = allChars.Length - expectedInCategory;
+
+            foreach (RegexOptions options in new[] { RegexOptions.None, RegexOptions.Compiled })
+            {
+                // The options must be passed to the constructor; otherwise RegexOptions.Compiled is never exercised
+                // and both iterations test the same thing.
+                Regex r = new Regex(@$"\p{{{generalCategory}}}", options);
+                Assert.Equal(expectedInCategory, r.Matches(input).Count);
+
+                r = new Regex(@$"\P{{{generalCategory}}}", options);
+                Assert.Equal(expectedNotInCategory, r.Matches(input).Count);
+            }
+        }
+
         private static HashSet<char> ComputeIncludedSet(Func<char, bool> func)
         {
             var included = new HashSet<char>();