Earlier in .NET 8, we updated the Regex compiler and source generator to be able to vectorize a search for any set, not just simple ones. When one of the main routines couldn't be used, we emit a specialized IndexOfAny helper that uses SearchValues to search for any matching ASCII character or any non-ASCII character, and if it encounters a non-ASCII character, it falls back to a linear scan. This meant that a bunch of sets that wouldn't previously have taken these paths now do, but some of those sets have more efficient means of searching; for example, for the set `[^aA]`, which searches case-insensitively for anything other than an 'A', with this scheme we'll emit a whole routine that uses SearchValues with a fallback, but we could just use IndexOfAnyExcept('A', 'a'). This fixes the compiler / source generator to prefer such helpers instead when available.
(true, _) => $"{span}.Slice(i + {primarySet.Distance})",
};
- Debug.Assert(!primarySet.Negated || (primarySet.Chars is null && primarySet.AsciiSet is null));
+ // Get the IndexOf* expression to use to perform the search.
+ string indexOf;
+ if (primarySet.Chars is not null)
+ {
+ // We have a chars array, so we can use IndexOf{Any}{Except} to search for it. Choose the best overload.
+ string indexOfName = "IndexOf", indexOfAnyName = "IndexOfAny";
+ if (primarySet.Negated)
+ {
+ indexOfName = indexOfAnyName = "IndexOfAnyExcept";
+ }
- string indexOf =
- primarySet.Chars is not null ? primarySet.Chars.Length switch
+ indexOf = primarySet.Chars.Length switch
{
- 1 => $"{span}.IndexOf({Literal(primarySet.Chars[0])})",
- 2 => $"{span}.IndexOfAny({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])})",
- 3 => $"{span}.IndexOfAny({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])}, {Literal(primarySet.Chars[2])})",
- _ => $"{span}.IndexOfAny({EmitSearchValuesOrLiteral(primarySet.Chars, requiredHelpers)})",
- } :
- primarySet.AsciiSet is not null ? $"{span}.IndexOfAny({EmitSearchValues(primarySet.AsciiSet, requiredHelpers)})" :
- primarySet.Range is not null ? (primarySet.Range.Value.LowInclusive == primarySet.Range.Value.HighInclusive, primarySet.Negated) switch
+ 1 => $"{span}.{indexOfName}({Literal(primarySet.Chars[0])})",
+ 2 => $"{span}.{indexOfAnyName}({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])})",
+ 3 => $"{span}.{indexOfAnyName}({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])}, {Literal(primarySet.Chars[2])})",
+ _ => $"{span}.{indexOfAnyName}({EmitSearchValuesOrLiteral(primarySet.Chars, requiredHelpers)})",
+ };
+ }
+ else if (primarySet.AsciiSet is not null)
+ {
+ // We have a set of ASCII chars, so we can use IndexOfAny(SearchValues) to search for it.
+ Debug.Assert(!primarySet.Negated);
+ indexOf = $"{span}.IndexOfAny({EmitSearchValues(primarySet.AsciiSet, requiredHelpers)})";
+ }
+ else if (primarySet.Range is not null)
+ {
+ // We have a range, so we can use IndexOfAny{Except}InRange to search for it. In the corner case,
+ // where we end up with a set of a single char, we can use IndexOf instead.
+ indexOf = (primarySet.Range.Value.LowInclusive == primarySet.Range.Value.HighInclusive, primarySet.Negated) switch
{
(false, false) => $"{span}.IndexOfAnyInRange({Literal(primarySet.Range.Value.LowInclusive)}, {Literal(primarySet.Range.Value.HighInclusive)})",
(true, false) => $"{span}.IndexOf({Literal(primarySet.Range.Value.LowInclusive)})",
(false, true) => $"{span}.IndexOfAnyExceptInRange({Literal(primarySet.Range.Value.LowInclusive)}, {Literal(primarySet.Range.Value.HighInclusive)})",
(true, true) => $"{span}.IndexOfAnyExcept({Literal(primarySet.Range.Value.LowInclusive)})",
- } :
- $"{span}.{EmitIndexOfAnyCustomHelper(primarySet.Set, requiredHelpers, checkOverflow)}()";
+ };
+ }
+ else
+ {
+ // We have an arbitrary set of characters that includes at least one non-ASCII char. We use a custom IndexOfAny helper that
+ // will perform the search as efficiently as possible.
+ indexOf = $"{span}.{EmitIndexOfAnyCustomHelper(primarySet.Set, requiredHelpers, checkOverflow)}()";
+ }
if (needLoop)
{
if (set.Chars is { Length: 1 })
{
+ Debug.Assert(!set.Negated);
writer.WriteLine($"pos = inputSpan.Slice(0, pos).LastIndexOf({Literal(set.Chars[0])});");
using (EmitBlock(writer, "if (pos >= 0)"))
{
{
if (iterationCount is null &&
node.Kind is RegexNodeKind.Notonelazy &&
- subsequent?.FindStartingLiteral(4) is RegexNode.StartingLiteralData literal && // 5 == max optimized by IndexOfAny, and we need to reserve 1 for node.Ch
+ subsequent?.FindStartingLiteral(4) is RegexNode.StartingLiteralData literal && // 5 == max efficiently optimized by IndexOfAny, and we need to reserve 1 for node.Ch
!literal.Negated && // not negated; can't search for both the node.Ch and a negated subsequent char with an IndexOf* method
(literal.String is not null ||
literal.SetChars is not null ||
Ldloc(textSpanLocal);
}
- Debug.Assert(!primarySet.Negated || (primarySet.Chars is null && primarySet.AsciiSet is null));
-
if (primarySet.Chars is not null)
{
switch (primarySet.Chars.Length)
case 1:
// tmp = ...IndexOf(setChars[0]);
Ldc(primarySet.Chars[0]);
- Call(s_spanIndexOfChar);
+ Call(primarySet.Negated ? s_spanIndexOfAnyExceptChar : s_spanIndexOfChar);
break;
case 2:
// tmp = ...IndexOfAny(setChars[0], setChars[1]);
Ldc(primarySet.Chars[0]);
Ldc(primarySet.Chars[1]);
- Call(s_spanIndexOfAnyCharChar);
+ Call(primarySet.Negated ? s_spanIndexOfAnyExceptCharChar : s_spanIndexOfAnyCharChar);
break;
case 3:
Ldc(primarySet.Chars[0]);
Ldc(primarySet.Chars[1]);
Ldc(primarySet.Chars[2]);
- Call(s_spanIndexOfAnyCharCharChar);
+ Call(primarySet.Negated ? s_spanIndexOfAnyExceptCharCharChar : s_spanIndexOfAnyCharCharChar);
break;
default:
Ldstr(new string(primarySet.Chars));
Call(s_stringAsSpanMethod);
- Call(s_spanIndexOfAnySpan);
+ Call(primarySet.Negated ? s_spanIndexOfAnyExceptSpan : s_spanIndexOfAnySpan);
break;
}
}
if (set.Chars is { Length: 1 })
{
+ Debug.Assert(!set.Negated);
+
// pos = inputSpan.Slice(0, pos).LastIndexOf(set.Chars[0]);
Ldloca(inputSpan);
Ldc(0);
using System.Collections.Generic;
using System.Diagnostics;
-using System.Globalization;
namespace System.Text.RegularExpressions
{
if (RegexPrefixAnalyzer.FindFirstCharClass(root) is string charClass)
{
// See if the set is limited to holding only a few characters.
- Span<char> scratch = stackalloc char[5]; // max optimized by IndexOfAny today
+ Span<char> scratch = stackalloc char[5]; // max efficiently optimized by IndexOfAny today
int scratchCount;
char[]? chars = null;
if (!RegexCharClass.IsNegated(charClass) &&
{
// The set contains one and only one character, meaning every match starts
// with the same literal value (potentially case-insensitive). Search for that.
+ Debug.Assert(!RegexCharClass.IsNegated(charClass));
FixedDistanceLiteral = (chars[0], null, 0);
FindMode = FindNextStartingPositionMode.LeadingChar_RightToLeft;
}
else
{
// The set may match multiple characters. Search for that.
+ Debug.Assert(!RegexCharClass.IsNegated(charClass) || chars is null);
FixedDistanceSets = new List<FixedDistanceSet>()
{
new FixedDistanceSet(chars, charClass, 0)
// As a backup, see if we can find a literal after a leading atomic loop. That might be better than whatever sets we find, so
// we want to know whether we have one in our pocket before deciding whether to use a leading set (we'll prefer a leading
- // set if it's something for which we can vectorize a search).
+ // set if it's something for which we can search efficiently).
(RegexNode LoopNode, (char Char, string? String, char[]? Chars) Literal)? literalAfterLoop = RegexPrefixAnalyzer.FindLiteralFollowingLeadingLoop(root);
- // If we got such sets, we'll likely use them. However, if the best of them is something that doesn't support a vectorized
- // search and we did successfully find a literal after an atomic loop we could search instead, we prefer the vectorizable search.
+ // If we got such sets, we'll likely use them. However, if the best of them is something that doesn't support an efficient
+ // search and we did successfully find a literal after an atomic loop we could search instead, we prefer the efficient search.
+ // For example, if we have a negated set, we will still prefer the literal-after-an-atomic-loop because negated sets typically
+ // contain _many_ characters (e.g. [^a] is everything but 'a') and are thus more likely to very quickly match, which means any
+ // vectorization employed is less likely to kick in and be worth the startup overhead.
if (fixedDistanceSets is not null)
{
+ // Sort the sets by "quality", such that whatever set is first is the one deemed most efficient to use.
+ // In some searches, we may use multiple sets, so we want the subsequent ones to also be the efficiency runners-up.
RegexPrefixAnalyzer.SortFixedDistanceSetsByQuality(fixedDistanceSets);
- if (fixedDistanceSets[0].Chars is not null || literalAfterLoop is null)
+
+ // If there is no literal after the loop, use whatever set we got.
+ // If there is a literal after the loop, consider it to be better than a negated set and better than a set with many characters.
+ if (literalAfterLoop is null ||
+ (fixedDistanceSets[0].Chars is not null && !fixedDistanceSets[0].Negated))
{
// Determine whether to do searching based on one or more sets or on a single literal. Compiled engines
// don't need to special-case literals as they already do codegen to create the optimal lookup based on
// the set's characteristics.
if (!compiled &&
fixedDistanceSets.Count == 1 &&
- fixedDistanceSets[0].Chars is { Length: 1 })
+ fixedDistanceSets[0].Chars is { Length: 1 } &&
+ !fixedDistanceSets[0].Negated)
{
FixedDistanceLiteral = (fixedDistanceSets[0].Chars![0], null, fixedDistanceSets[0].Distance);
FindMode = FindNextStartingPositionMode.FixedDistanceChar_LeftToRight;
// Store the sets, and compute which mode to use.
FixedDistanceSets = fixedDistanceSets;
- FindMode = (fixedDistanceSets.Count == 1 && fixedDistanceSets[0].Distance == 0) ? FindNextStartingPositionMode.LeadingSet_LeftToRight
- : FindNextStartingPositionMode.FixedDistanceSets_LeftToRight;
+ FindMode = (fixedDistanceSets.Count == 1 && fixedDistanceSets[0].Distance == 0) ?
+ FindNextStartingPositionMode.LeadingSet_LeftToRight :
+ FindNextStartingPositionMode.FixedDistanceSets_LeftToRight;
_asciiLookups = new uint[fixedDistanceSets.Count][];
}
return;
return best;
}
+#if SYSTEM_TEXT_REGULAREXPRESSIONS
/// <summary>Try to advance to the next starting position that might be a location for a match.</summary>
/// <param name="textSpan">The text to search.</param>
/// <param name="pos">The position in <paramref name="textSpan"/>. This is updated with the found position.</param>
{
FixedDistanceSet primarySet = FixedDistanceSets![0];
char[]? chars = primarySet.Chars;
- string set = primarySet.Set;
ReadOnlySpan<char> span = textSpan.Slice(pos);
if (chars is not null)
{
- int i = span.IndexOfAny(chars);
+ int i = primarySet.Negated ? span.IndexOfAnyExcept(chars) : span.IndexOfAny(chars);
if (i >= 0)
{
pos += i;
ref uint[]? startingAsciiLookup = ref _asciiLookups![0];
for (int i = 0; i < span.Length; i++)
{
- if (RegexCharClass.CharInClass(span[i], set, ref startingAsciiLookup))
+ if (RegexCharClass.CharInClass(span[i], primarySet.Set, ref startingAsciiLookup))
{
pos += i;
return true;
for (int inputPosition = pos; inputPosition <= endMinusRequiredLength; inputPosition++)
{
int offset = inputPosition + primarySet.Distance;
- int index = textSpan.Slice(offset).IndexOfAny(primarySet.Chars);
+ ReadOnlySpan<char> textSpanAtOffset = textSpan.Slice(offset);
+ int index = primarySet.Negated ? textSpanAtOffset.IndexOfAnyExcept(primarySet.Chars) : textSpanAtOffset.IndexOfAny(primarySet.Chars);
if (index < 0)
{
break;
return true;
}
}
+#endif
}
/// <summary>Mode to use for searching for the next location of a possible match.</summary>
/// A tuple of data about the literal: only one of the Char/String/SetChars fields is relevant.
/// The Negated value indicates whether the Char/SetChars should be considered exclusionary.
/// </returns>
- public StartingLiteralData? FindStartingLiteral(int maxSetCharacters = 5) // 5 is max optimized by IndexOfAny today
+ public StartingLiteralData? FindStartingLiteral(int maxSetCharacters = 5) // 5 is max efficiently optimized by IndexOfAny today
{
Debug.Assert(maxSetCharacters >= 0 && maxSetCharacters <= 128, $"{nameof(maxSetCharacters)} == {maxSetCharacters} should be small enough to be stack allocated.");
using System.Collections.Generic;
using System.Diagnostics;
-using System.Globalization;
using System.Runtime.CompilerServices;
using System.Threading;
// Find all fixed-distance sets.
var results = new List<RegexFindOptimizations.FixedDistanceSet>();
int distance = 0;
- TryFindFixedSets(root, results, ref distance, thorough);
+ TryFindRawFixedSets(root, results, ref distance, thorough);
+#if DEBUG
+ results.ForEach(r => Debug.Assert(
+ !r.Negated && r.Chars is null && r.AsciiSet is null && r.Range is null,
+ $"{nameof(TryFindRawFixedSets)} should have only populated {nameof(r.Set)} and {nameof(r.Distance)}"));
+#endif
// Remove any sets that match everything; they're not helpful. (This check exists primarily to weed
// out use of . in Singleline mode, but also filters out explicit sets like [\s\S].)
// For every entry, try to get the chars that make up the set, if there are few enough.
// For any for which we couldn't get the small chars list, see if we can get other useful info.
- Span<char> scratch = stackalloc char[5]; // max optimized by IndexOfAny today
+ Span<char> scratch = stackalloc char[5]; // max efficiently optimized by IndexOfAny today
for (int i = 0; i < results.Count; i++)
{
RegexFindOptimizations.FixedDistanceSet result = results[i];
int count = RegexCharClass.GetSetChars(result.Set, scratch);
- if (!result.Negated && count > 0)
+ if (count > 0)
{
result.Chars = scratch.Slice(0, count).ToArray();
}
// is at a fixed distance, in which case distance will have been updated to include the full length
// of the node. If it returns false, the node isn't entirely fixed, in which case subsequent nodes
// shouldn't be examined and distance should no longer be trusted. However, regardless of whether it
- // returns true or false, it may have populated results, and all populated results are valid.
- static bool TryFindFixedSets(RegexNode node, List<RegexFindOptimizations.FixedDistanceSet> results, ref int distance, bool thorough)
+ // returns true or false, it may have populated results, and all populated results are valid. Each
+ // FixedDistanceSet result will have only its Set string and Distance populated; the rest is left
+ // to be populated by FindFixedDistanceSets after this returns.
+ static bool TryFindRawFixedSets(RegexNode node, List<RegexFindOptimizations.FixedDistanceSet> results, ref int distance, bool thorough)
{
if (!StackHelper.TryEnsureSufficientExecutionStack())
{
case RegexNodeKind.Atomic:
case RegexNodeKind.Group:
case RegexNodeKind.Capture:
- return TryFindFixedSets(node.Child(0), results, ref distance, thorough);
+ return TryFindRawFixedSets(node.Child(0), results, ref distance, thorough);
case RegexNodeKind.Lazyloop or RegexNodeKind.Loop when node.M > 0:
// This effectively only iterates the loop once. If deemed valuable,
// summed distance for all node.M iterations. If node.M == node.N,
// this would then also allow continued evaluation of the rest of the
// expression after the loop.
- TryFindFixedSets(node.Child(0), results, ref distance, thorough);
+ TryFindRawFixedSets(node.Child(0), results, ref distance, thorough);
return false;
case RegexNodeKind.Concatenate:
int childCount = node.ChildCount();
for (int i = 0; i < childCount; i++)
{
- if (!TryFindFixedSets(node.Child(i), results, ref distance, thorough))
+ if (!TryFindRawFixedSets(node.Child(i), results, ref distance, thorough))
{
return false;
}
{
localResults.Clear();
int localDistance = 0;
- allSameSize &= TryFindFixedSets(node.Child(i), localResults, ref localDistance, thorough);
+ allSameSize &= TryFindRawFixedSets(node.Child(i), localResults, ref localDistance, thorough);
if (localResults.Count == 0)
{
int s1RangeLength = s1.Range is not null ? GetRangeLength(s1.Range.Value, s1Negated) : 0;
int s2RangeLength = s2.Range is not null ? GetRangeLength(s2.Range.Value, s2Negated) : 0;
- Debug.Assert(!s1Negated || s1Chars is null);
- Debug.Assert(!s2Negated || s2Chars is null);
+ // If one set is negated and the other isn't, prefer the non-negated set. In general, negated
+ // sets are large and thus likely to match more frequently, making them slower to search for.
+ if (s1Negated != s2Negated)
+ {
+ return s2Negated ? -1 : 1;
+ }
- // If both have chars, prioritize the one with the smaller frequency for those chars.
- if (s1Chars is not null && s2Chars is not null)
+ // If we extracted only a few chars and the sets are negated, they both represent very large
+ // sets that are difficult to compare for quality.
+ if (!s1Negated)
{
- // Prefer sets with less frequent values. The frequency is only an approximation,
- // used as a tie-breaker when we'd otherwise effectively be picking randomly.
- // True frequencies will vary widely based on the actual data being searched, the language of the data, etc.
- float s1Frequency = SumFrequencies(s1Chars);
- float s2Frequency = SumFrequencies(s2Chars);
+ Debug.Assert(!s2Negated);
- if (s1Frequency != s2Frequency)
+ // If both have chars, prioritize the one with the smaller frequency for those chars.
+ if (s1Chars is not null && s2Chars is not null)
{
- return s1Frequency.CompareTo(s2Frequency);
- }
+ // Prefer sets with less frequent values. The frequency is only an approximation,
+ // used as a tie-breaker when we'd otherwise effectively be picking randomly.
+ // True frequencies will vary widely based on the actual data being searched, the language of the data, etc.
+ float s1Frequency = SumFrequencies(s1Chars);
+ float s2Frequency = SumFrequencies(s2Chars);
- if (!RegexCharClass.IsAscii(s1Chars) && !RegexCharClass.IsAscii(s2Chars))
- {
- // Prefer the set with fewer values.
- return s1CharsLength.CompareTo(s2CharsLength);
- }
+ if (s1Frequency != s2Frequency)
+ {
+ return s1Frequency.CompareTo(s2Frequency);
+ }
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- static float SumFrequencies(char[] chars)
- {
- float sum = 0;
- foreach (char c in chars)
+ if (!RegexCharClass.IsAscii(s1Chars) && !RegexCharClass.IsAscii(s2Chars))
+ {
+ // Prefer the set with fewer values.
+ return s1CharsLength.CompareTo(s2CharsLength);
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ static float SumFrequencies(char[] chars)
{
- // Lookup each character in the table. Values >= 128 are ignored
- // and thus we'll get skew in the data. It's already a gross approximation, though,
- // and it is primarily meant for disambiguation of ASCII letters.
- if (c < 128)
+ float sum = 0;
+ foreach (char c in chars)
{
- sum += Frequency[c];
+ // Lookup each character in the table. Values >= 128 are ignored
+ // and thus we'll get skew in the data. It's already a gross approximation, though,
+ // and it is primarily meant for disambiguation of ASCII letters.
+ if (c < 128)
+ {
+ sum += Frequency[c];
+ }
}
+ return sum;
}
- return sum;
}
- }
- // If one has chars and the other has a range, prefer the shorter set.
- if ((s1CharsLength > 0 && s2RangeLength > 0) || (s1RangeLength > 0 && s2CharsLength > 0))
- {
- int c = Math.Max(s1CharsLength, s1RangeLength).CompareTo(Math.Max(s2CharsLength, s2RangeLength));
- if (c != 0)
+ // If one has chars and the other has a range, prefer the shorter set.
+ if ((s1CharsLength > 0 && s2RangeLength > 0) || (s1RangeLength > 0 && s2CharsLength > 0))
{
- return c;
- }
+ int c = Math.Max(s1CharsLength, s1RangeLength).CompareTo(Math.Max(s2CharsLength, s2RangeLength));
+ if (c != 0)
+ {
+ return c;
+ }
- // If lengths are the same, prefer the chars.
- return s1CharsLength > 0 ? -1 : 1;
- }
+ // If lengths are the same, prefer the chars.
+ return s1CharsLength > 0 ? -1 : 1;
+ }
- // If one has chars and the other doesn't, prioritize the one with chars.
- if ((s1CharsLength > 0) != (s2CharsLength > 0))
- {
- return s1CharsLength > 0 ? -1 : 1;
+ // If one has chars and the other doesn't, prioritize the one with chars.
+ if ((s1CharsLength > 0) != (s2CharsLength > 0))
+ {
+ return s1CharsLength > 0 ? -1 : 1;
+ }
}
// If one has a range and the other doesn't, prioritize the one with a range.