!IsSubtraction(set) &&
(set[SetStartIndex] == LastChar || set[SetStartIndex] + 1 == set[SetStartIndex + 1]);
+ /// <summary>Determines whether the set consists of exactly one UnicodeCategory entry (possibly negated) and nothing else.</summary>
+ /// <param name="set">The character class string to inspect.</param>
+ /// <param name="category">On success, the sole category referenced by the set; otherwise, the default value.</param>
+ /// <param name="negated">On success, true if the set matches characters that are not in <paramref name="category"/>; otherwise, false.</param>
+ /// <returns>true if the set reduces to a single category; otherwise, false.</returns>
+ public static bool TryGetSingleUnicodeCategory(string set, out UnicodeCategory category, out bool negated)
+ {
+     // Only a set with exactly one category entry, no set entries, and no subtraction is eligible.
+     bool eligible =
+         set[CategoryLengthIndex] == 1 &&
+         set[SetLengthIndex] == 0 &&
+         !IsSubtraction(set);
+
+     // The entry is read as a signed value. An ineligible set is treated as code 0,
+     // which matches neither branch below and so falls through to the failure path.
+     short code = eligible ? (short)set[SetStartIndex] : (short)0;
+
+     if (code > 0 && code != SpaceConst)
+     {
+         // A positive code holds the category value plus one; the set's own negation applies as-is.
+         category = (UnicodeCategory)(code - 1);
+         negated = IsNegated(set);
+         return true;
+     }
+
+     if (code < 0 && code != NotSpaceConst)
+     {
+         // A negative code holds -(category value + 1), so the set's negation is inverted.
+         category = (UnicodeCategory)(-1 - code);
+         negated = !IsNegated(set);
+         return true;
+     }
+
+     category = default;
+     negated = false;
+     return false;
+ }
+
/// <summary>Attempts to get a single range stored in the set.</summary>
/// <param name="set">The set.</param>
/// <param name="lowInclusive">The inclusive lower-bound of the range, if available.</param>
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
-using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
private static readonly MethodInfo s_charIsDigitMethod = typeof(char).GetMethod("IsDigit", new Type[] { typeof(char) })!;
private static readonly MethodInfo s_charIsWhiteSpaceMethod = typeof(char).GetMethod("IsWhiteSpace", new Type[] { typeof(char) })!;
+ private static readonly MethodInfo s_charGetUnicodeInfo = typeof(char).GetMethod("GetUnicodeCategory", new Type[] { typeof(char) })!;
private static readonly MethodInfo s_charToLowerMethod = typeof(char).GetMethod("ToLower", new Type[] { typeof(char), typeof(CultureInfo) })!;
private static readonly MethodInfo s_charToLowerInvariantMethod = typeof(char).GetMethod("ToLowerInvariant", new Type[] { typeof(char) })!;
private static readonly MethodInfo s_cultureInfoGetCurrentCultureMethod = typeof(CultureInfo).GetMethod("get_CurrentCulture")!;
return;
}
+ // Next, if the character class contains nothing but a single Unicode category, we can call char.GetUnicodeCategory and
+ // compare against it. It has a fast-lookup path for ASCII, so is as good or better than any lookup we'd generate (plus
+ // we get smaller code), and it's what we'd do for the fallback (which we get to avoid generating) as part of CharInClass.
+ if (!invariant && RegexCharClass.TryGetSingleUnicodeCategory(charClass, out UnicodeCategory category, out bool negated))
+ {
+ // char.GetUnicodeCategory(ch) == category
+ Call(s_charGetUnicodeInfo);
+ Ldc((int)category);
+ Ceq();
+ if (negated)
+ {
+ Ldc(0);
+ Ceq();
+ }
+
+ return;
+ }
+
// All checks after this point require reading the input character multiple times,
// so we store it into a temporary local.
Stloc(tempLocal);
using System.Collections.Generic;
using System.Globalization;
+using System.Linq;
using Xunit;
using Xunit.Sdk;
ValidateSet($"[^{set}]", RegexOptions.None, null, included);
}
+ [Theory]
+ [InlineData("Cc", UnicodeCategory.Control)]
+ [InlineData("Cf", UnicodeCategory.Format)]
+ [InlineData("Cn", UnicodeCategory.OtherNotAssigned)]
+ [InlineData("Co", UnicodeCategory.PrivateUse)]
+ [InlineData("Cs", UnicodeCategory.Surrogate)]
+ [InlineData("Ll", UnicodeCategory.LowercaseLetter)]
+ [InlineData("Lm", UnicodeCategory.ModifierLetter)]
+ [InlineData("Lo", UnicodeCategory.OtherLetter)]
+ [InlineData("Lt", UnicodeCategory.TitlecaseLetter)]
+ [InlineData("Lu", UnicodeCategory.UppercaseLetter)]
+ [InlineData("Mc", UnicodeCategory.SpacingCombiningMark)]
+ [InlineData("Me", UnicodeCategory.EnclosingMark)]
+ [InlineData("Mn", UnicodeCategory.NonSpacingMark)]
+ [InlineData("Nd", UnicodeCategory.DecimalDigitNumber)]
+ [InlineData("Nl", UnicodeCategory.LetterNumber)]
+ [InlineData("No", UnicodeCategory.OtherNumber)]
+ [InlineData("Pc", UnicodeCategory.ConnectorPunctuation)]
+ [InlineData("Pd", UnicodeCategory.DashPunctuation)]
+ [InlineData("Pe", UnicodeCategory.ClosePunctuation)]
+ [InlineData("Po", UnicodeCategory.OtherPunctuation)]
+ [InlineData("Ps", UnicodeCategory.OpenPunctuation)]
+ [InlineData("Pf", UnicodeCategory.FinalQuotePunctuation)]
+ [InlineData("Pi", UnicodeCategory.InitialQuotePunctuation)]
+ [InlineData("Sc", UnicodeCategory.CurrencySymbol)]
+ [InlineData("Sk", UnicodeCategory.ModifierSymbol)]
+ [InlineData("Sm", UnicodeCategory.MathSymbol)]
+ [InlineData("So", UnicodeCategory.OtherSymbol)]
+ [InlineData("Zl", UnicodeCategory.LineSeparator)]
+ [InlineData("Zp", UnicodeCategory.ParagraphSeparator)]
+ [InlineData("Zs", UnicodeCategory.SpaceSeparator)]
+ // Verifies that \p{X} and \P{X} match exactly the chars that char.GetUnicodeCategory places
+ // inside / outside the given category, across every UTF-16 code unit, for each tested RegexOptions value.
+ public void UnicodeCategoriesInclusionsExpected(string generalCategory, UnicodeCategory unicodeCategory)
+ {
+     // The input and the expected counts do not depend on the options, so compute them once.
+     char[] allChars = Enumerable.Range(0, char.MaxValue + 1).Select(i => (char)i).ToArray();
+     string input = new string(allChars);
+     int expectedInCategory = allChars.Count(c => char.GetUnicodeCategory(c) == unicodeCategory);
+     int expectedNotInCategory = allChars.Length - expectedInCategory;
+
+     foreach (RegexOptions options in new[] { RegexOptions.None, RegexOptions.Compiled })
+     {
+         // Pass the options through; otherwise every iteration would silently use the default engine
+         // and the RegexOptions.Compiled code path would never be exercised.
+         Regex r = new Regex(@$"\p{{{generalCategory}}}", options);
+         Assert.Equal(expectedInCategory, r.Matches(input).Count);
+
+         r = new Regex(@$"\P{{{generalCategory}}}", options);
+         Assert.Equal(expectedNotInCategory, r.Matches(input).Count);
+     }
+ }
+
private static HashSet<char> ComputeIncludedSet(Func<char, bool> func)
{
var included = new HashSet<char>();