Use char.GetUnicodeCategory in RegexCompiler for single Unicode categories (#31881)

author Stephen Toub <stoub@microsoft.com>

Fri, 7 Feb 2020 04:51:15 +0000 (23:51 -0500)

committer GitHub <noreply@github.com>

Fri, 7 Feb 2020 04:51:15 +0000 (23:51 -0500)
author Stephen Toub <stoub@microsoft.com>
Fri, 7 Feb 2020 04:51:15 +0000 (23:51 -0500)
committer GitHub <noreply@github.com>
Fri, 7 Feb 2020 04:51:15 +0000 (23:51 -0500)
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs

index 37e1d43..f64024f 100644 (file)
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
@@ -744,6 +744,44 @@ namespace System.Text.RegularExpressions
              !IsSubtraction(set) &&
              (set[SetStartIndex] == LastChar || set[SetStartIndex] + 1 == set[SetStartIndex + 1]);
  
+        /// <summary>Determines whether the set consists of exactly one UnicodeCategory (possibly negated) and nothing else.</summary>
+        /// <param name="set">The character class string to inspect.</param>
+        /// <param name="category">On success, the lone category described by the set; otherwise, the default value.</param>
+        /// <param name="negated">On success, true if the set matches characters outside of the category; otherwise, false.</param>
+        /// <returns>true if the set describes exactly one category; otherwise, false.</returns>
+        public static bool TryGetSingleUnicodeCategory(string set, out UnicodeCategory category, out bool negated)
+        {
+            category = default;
+            negated = false;
+
+            // Only a set with a single category entry, no character ranges, and no subtraction qualifies.
+            if (set[CategoryLengthIndex] != 1 || set[SetLengthIndex] != 0 || IsSubtraction(set))
+            {
+                return false;
+            }
+
+            // The category entry is read as a signed value; its sign selects which branch below applies.
+            short code = (short)set[SetStartIndex];
+            // Positive entry (other than SpaceConst): the category is (code - 1), negated only if the set itself is.
+            if (code > 0 && code != SpaceConst)
+            {
+                category = (UnicodeCategory)(code - 1);
+                negated = IsNegated(set);
+                return true;
+            }
+
+            // Negative entry (other than NotSpaceConst): the category is (-1 - code), with the set's negation flipped.
+            if (code < 0 && code != NotSpaceConst)
+            {
+                category = (UnicodeCategory)(-1 - code);
+                negated = !IsNegated(set);
+                return true;
+            }
+
+            // Zero, SpaceConst, and NotSpaceConst are not treated as a single category.
+            return false;
+        }
+
          /// <summary>Attempts to get a single range stored in the set.</summary>
          /// <param name="set">The set.</param>
          /// <param name="lowInclusive">The inclusive lower-bound of the range, if available.</param>
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs

index c615dff..325187c 100644 (file)
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
@@ -2,7 +2,6 @@
  // The .NET Foundation licenses this file to you under the MIT license.
  // See the LICENSE file in the project root for more information.
  
-using System.Collections.Concurrent;
  using System.Collections.Generic;
  using System.Diagnostics;
  using System.Diagnostics.CodeAnalysis;
@@ -49,6 +48,7 @@ namespace System.Text.RegularExpressions
  
          private static readonly MethodInfo s_charIsDigitMethod = typeof(char).GetMethod("IsDigit", new Type[] { typeof(char) })!;
          private static readonly MethodInfo s_charIsWhiteSpaceMethod = typeof(char).GetMethod("IsWhiteSpace", new Type[] { typeof(char) })!;
+        private static readonly MethodInfo s_charGetUnicodeInfo = typeof(char).GetMethod("GetUnicodeCategory", new Type[] { typeof(char) })!;
          private static readonly MethodInfo s_charToLowerMethod = typeof(char).GetMethod("ToLower", new Type[] { typeof(char), typeof(CultureInfo) })!;
          private static readonly MethodInfo s_charToLowerInvariantMethod = typeof(char).GetMethod("ToLowerInvariant", new Type[] { typeof(char) })!;
          private static readonly MethodInfo s_cultureInfoGetCurrentCultureMethod = typeof(CultureInfo).GetMethod("get_CurrentCulture")!;
@@ -4760,6 +4760,24 @@ namespace System.Text.RegularExpressions
                  return;
              }
  
+            // Next, if the character class contains nothing but a single Unicode category, we can call char.GetUnicodeCategory and
+            // compare against it.  It has a fast-lookup path for ASCII, so is as good or better than any lookup we'd generate (plus
+            // we get smaller code), and it's what we'd do for the fallback (which we get to avoid generating) as part of CharInClass.
+            if (!invariant && RegexCharClass.TryGetSingleUnicodeCategory(charClass, out UnicodeCategory category, out bool negated))
+            {
+                // char.GetUnicodeCategory(ch) == category
+                Call(s_charGetUnicodeInfo);
+                Ldc((int)category);
+                Ceq();
+                if (negated)
+                {
+                    Ldc(0);
+                    Ceq();
+                }
+
+                return;
+            }
+
              // All checks after this point require reading the input character multiple times,
              // so we store it into a temporary local.
              Stloc(tempLocal);
diff --git a/src/libraries/System.Text.RegularExpressions/tests/RegexCharacterSetTests.cs b/src/libraries/System.Text.RegularExpressions/tests/RegexCharacterSetTests.cs

index 1b1c90b..a13484f 100644 (file)
--- a/src/libraries/System.Text.RegularExpressions/tests/RegexCharacterSetTests.cs
+++ b/src/libraries/System.Text.RegularExpressions/tests/RegexCharacterSetTests.cs
@@ -4,6 +4,7 @@
  
  using System.Collections.Generic;
  using System.Globalization;
+using System.Linq;
  using Xunit;
  using Xunit.Sdk;
  
@@ -294,6 +295,54 @@ namespace System.Text.RegularExpressions.Tests
              ValidateSet($"[^{set}]", RegexOptions.None, null, included);
          }
  
+        [Theory]
+        [InlineData("Cc", UnicodeCategory.Control)]
+        [InlineData("Cf", UnicodeCategory.Format)]
+        [InlineData("Cn", UnicodeCategory.OtherNotAssigned)]
+        [InlineData("Co", UnicodeCategory.PrivateUse)]
+        [InlineData("Cs", UnicodeCategory.Surrogate)]
+        [InlineData("Ll", UnicodeCategory.LowercaseLetter)]
+        [InlineData("Lm", UnicodeCategory.ModifierLetter)]
+        [InlineData("Lo", UnicodeCategory.OtherLetter)]
+        [InlineData("Lt", UnicodeCategory.TitlecaseLetter)]
+        [InlineData("Lu", UnicodeCategory.UppercaseLetter)]
+        [InlineData("Mc", UnicodeCategory.SpacingCombiningMark)]
+        [InlineData("Me", UnicodeCategory.EnclosingMark)]
+        [InlineData("Mn", UnicodeCategory.NonSpacingMark)]
+        [InlineData("Nd", UnicodeCategory.DecimalDigitNumber)]
+        [InlineData("Nl", UnicodeCategory.LetterNumber)]
+        [InlineData("No", UnicodeCategory.OtherNumber)]
+        [InlineData("Pc", UnicodeCategory.ConnectorPunctuation)]
+        [InlineData("Pd", UnicodeCategory.DashPunctuation)]
+        [InlineData("Pe", UnicodeCategory.ClosePunctuation)]
+        [InlineData("Po", UnicodeCategory.OtherPunctuation)]
+        [InlineData("Ps", UnicodeCategory.OpenPunctuation)]
+        [InlineData("Pf", UnicodeCategory.FinalQuotePunctuation)]
+        [InlineData("Pi", UnicodeCategory.InitialQuotePunctuation)]
+        [InlineData("Sc", UnicodeCategory.CurrencySymbol)]
+        [InlineData("Sk", UnicodeCategory.ModifierSymbol)]
+        [InlineData("Sm", UnicodeCategory.MathSymbol)]
+        [InlineData("So", UnicodeCategory.OtherSymbol)]
+        [InlineData("Zl", UnicodeCategory.LineSeparator)]
+        [InlineData("Zp", UnicodeCategory.ParagraphSeparator)]
+        [InlineData("Zs", UnicodeCategory.SpaceSeparator)]
+        public void UnicodeCategoriesInclusionsExpected(string generalCategory, UnicodeCategory unicodeCategory)
+        {
+            // The input and expected counts don't depend on the options, so compute them once up front.
+            char[] allChars = Enumerable.Range(0, char.MaxValue + 1).Select(i => (char)i).ToArray();
+            int expectedInCategory = allChars.Count(c => char.GetUnicodeCategory(c) == unicodeCategory);
+            int expectedNotInCategory = allChars.Length - expectedInCategory;
+            string input = string.Concat(allChars);
+            // Pass the options to each Regex so that both RegexOptions.None and RegexOptions.Compiled are actually tested.
+            foreach (RegexOptions options in new[] { RegexOptions.None, RegexOptions.Compiled })
+            {
+                Regex inCategory = new Regex(@$"\p{{{generalCategory}}}", options);
+                Assert.Equal(expectedInCategory, inCategory.Matches(input).Count);
+                Regex notInCategory = new Regex(@$"\P{{{generalCategory}}}", options);
+                Assert.Equal(expectedNotInCategory, notInCategory.Matches(input).Count);
+            }
+        }
+
          private static HashSet<char> ComputeIncludedSet(Func<char, bool> func)
          {
              var included = new HashSet<char>();
author	Stephen Toub <stoub@microsoft.com>
	Fri, 7 Feb 2020 04:51:15 +0000 (23:51 -0500)
committer	GitHub <noreply@github.com>
	Fri, 7 Feb 2020 04:51:15 +0000 (23:51 -0500)
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs		patch \| blob \| history
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs		patch \| blob \| history
src/libraries/System.Text.RegularExpressions/tests/RegexCharacterSetTests.cs		patch \| blob \| history