{
RegexOptions options = (RegexOptions)rm.Options;
RegexCode code = rm.Code;
- (string CharClass, bool CaseInsensitive)[]? lcc = code.LeadingCharClasses;
bool rtl = code.RightToLeft;
bool hasTextInfo = false;
- bool textInfoEmitted = false;
// Emit locals initialization
writer.WriteLine("string runtext = base.runtext!;");
};
using (EmitBlock(writer, clause))
{
- EmitAnchors();
-
- if (code.BoyerMoorePrefix is RegexBoyerMoore { NegativeUnicode: null } rbm)
- {
- if (rbm.PatternSupportsIndexOf)
- {
- EmitIndexOf(rbm.Pattern);
- }
- else
- {
- EmitBoyerMoore(rbm);
- }
- }
- else if (lcc is not null)
+ // Emit any anchors.
+ if (!EmitAnchors())
{
- if (rtl)
- {
- EmitLeadingCharacter_RightToLeft();
- }
- else
+ // Either anchors weren't specified, or they don't completely root all matches to a specific location.
+
+ // If whatever search operation we need to perform entails case-insensitive operations
+ // that weren't already handled via creation of sets, we need to get and store the
+ // TextInfo object to use (unless RegexOptions.CultureInvariant was specified).
+ EmitTextInfo(writer, ref hasTextInfo, rm);
+
+ // Emit the code for whatever find mode has been determined.
+ switch (code.FindOptimizations.FindMode)
{
- EmitLeadingCharacter_LeftToRight();
+ case FindNextStartingPositionMode.LeadingPrefix_LeftToRight_CaseSensitive:
+ Debug.Assert(!string.IsNullOrEmpty(code.FindOptimizations.LeadingCaseSensitivePrefix));
+ EmitIndexOf_LeftToRight(code.FindOptimizations.LeadingCaseSensitivePrefix);
+ break;
+
+ case FindNextStartingPositionMode.LeadingPrefix_RightToLeft_CaseSensitive:
+ Debug.Assert(!string.IsNullOrEmpty(code.FindOptimizations.LeadingCaseSensitivePrefix));
+ EmitIndexOf_RightToLeft(code.FindOptimizations.LeadingCaseSensitivePrefix);
+ break;
+
+ case FindNextStartingPositionMode.FixedSets_LeftToRight_CaseSensitive:
+ case FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive:
+ case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseSensitive:
+ case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive:
+ Debug.Assert(code.FindOptimizations.FixedDistanceSets is { Count: > 0 });
+ EmitFixedSet_LeftToRight();
+ break;
+
+ case FindNextStartingPositionMode.LeadingSet_RightToLeft_CaseSensitive:
+ case FindNextStartingPositionMode.LeadingSet_RightToLeft_CaseInsensitive:
+ Debug.Assert(code.FindOptimizations.FixedDistanceSets is { Count: > 0 });
+ EmitFixedSet_RightToLeft();
+ break;
+
+ default:
+ Debug.Fail($"Unexpected mode: {code.FindOptimizations.FindMode}");
+ goto case FindNextStartingPositionMode.NoSearch;
+
+ case FindNextStartingPositionMode.NoSearch:
+ writer.WriteLine("return true;");
+ break;
}
}
- else
- {
- writer.WriteLine("return true;");
- }
}
writer.WriteLine();
writer.WriteLine(!rm.Code.RightToLeft ? "base.runtextpos = runtextend;" : "base.runtextpos = runtextbeg;");
writer.WriteLine("return false;");
- void EmitAnchors()
+ // Emits any anchors. Returns true if the anchor roots any match to a specific location and thus no further
+ // searching is required; otherwise, false.
+ bool EmitAnchors()
{
// Generate anchor checks.
- if ((code.LeadingAnchor & (RegexPrefixAnalyzer.Beginning | RegexPrefixAnalyzer.Start | RegexPrefixAnalyzer.EndZ | RegexPrefixAnalyzer.End | RegexPrefixAnalyzer.Bol)) != 0)
+ if ((code.FindOptimizations.LeadingAnchor & (RegexPrefixAnalyzer.Beginning | RegexPrefixAnalyzer.Start | RegexPrefixAnalyzer.EndZ | RegexPrefixAnalyzer.End | RegexPrefixAnalyzer.Bol)) != 0)
{
- // TODO: RegexInterpreter also factors in a Boyer-Moore prefix check in places Compiled just returns true.
- // Determine if we should do so here and in Compiled as well, and potentially update RegexInterpreter.
- // Interpreted and Compiled also differ in various places as to whether they update positions, as do LTR vs RTL. Determine why.
- switch (code.LeadingAnchor)
+ // TODO: Interpreted and Compiled differ in various places as to whether they update positions, as do LTR vs RTL. Determine why.
+ switch (code.FindOptimizations.LeadingAnchor)
{
case RegexPrefixAnalyzer.Beginning:
writer.WriteLine("// Beginning \\A anchor");
}
}
writer.WriteLine("return true;");
- return;
+ return true;
case RegexPrefixAnalyzer.Start:
writer.WriteLine("// Start \\G anchor");
}
}
writer.WriteLine("return true;");
- return;
+ return true;
case RegexPrefixAnalyzer.EndZ:
// TODO: Why are the LTR and RTL cases inconsistent here with RegexOptions.Compiled?
}
}
writer.WriteLine("return true;");
- return;
+ return true;
- case RegexPrefixAnalyzer.End when minRequiredLength == 0: // if it's > 0, we already output a more stringent check
+ case RegexPrefixAnalyzer.End:
writer.WriteLine("// End \\z anchor");
if (!rtl)
{
}
}
writer.WriteLine("return true;");
- return;
+ return true;
- case RegexPrefixAnalyzer.Bol when !rtl: // Don't bother optimizing for the niche case of RegexOptions.RightToLeft | RegexOptions.Multiline
+ case RegexPrefixAnalyzer.Bol:
// Optimize the handling of a Beginning-Of-Line (BOL) anchor. BOL is special, in that unlike
// other anchors like Beginning, there are potentially multiple places a BOL can match. So unlike
// the other anchors, which all skip all subsequent processing if found, with BOL we just use it
- // to boost our position to the next line, and then continue normally with any Boyer-Moore or
- // leading char class searches.
+ // to boost our position to the next line, and then continue normally with any searches.
+ Debug.Assert(!rtl, "RightToLeft isn't implemented and should have been filtered out previously");
writer.WriteLine("// Beginning-of-line anchor");
using (EmitBlock(writer, "if (runtextpos > runtextbeg && runtext[runtextpos - 1] != '\\n')"))
{
break;
}
}
- }
-
- void EmitBoyerMoore(RegexBoyerMoore rbm)
- {
- EmitTextInfoIfRequired(writer, ref textInfoEmitted, ref hasTextInfo, rm);
-
- int beforefirst;
- int last;
- if (!rtl)
- {
- //limitLocal = "runtextend";
- beforefirst = -1;
- last = rbm.Pattern.Length - 1;
- }
- else
- {
- //limitLocal = "runtextbeg";
- beforefirst = rbm.Pattern.Length;
- last = 0;
- }
-
- int chLast = rbm.Pattern[last];
-
- EmitAdd(writer, "runtextpos", !rtl ? rbm.Pattern.Length - 1 : -rbm.Pattern.Length);
-
- using (EmitBlock(writer, $"while ({(!rtl ? "runtextpos < runtextend" : "runtextpos >= runtextbeg")})"))
- {
- writer.WriteLine($"ch = {ToLowerIfNeeded(hasTextInfo, options, "runtext[runtextpos]", rbm.CaseInsensitive)};");
-
- using (EmitBlock(writer, $"if (ch != {Literal((char)chLast)})"))
- {
- writer.WriteLine($"ch -= {Literal((char)rbm.LowASCII)};");
- using (EmitBlock(writer, $"if ((uint)ch > ({Literal((char)rbm.HighASCII)} - {Literal((char)rbm.LowASCII)}))"))
- {
- EmitAdd(writer, "runtextpos", (!rtl ? rbm.Pattern.Length : -rbm.Pattern.Length));
- writer.WriteLine("continue;");
- }
-
- int negativeRange = rbm.HighASCII - rbm.LowASCII + 1;
- if (negativeRange > 1) // High > Low
- {
- // Create a string to store the lookup table we use to find the offset.
- // Store the offsets into the string. RightToLeft has negative offsets, so to support it with chars (unsigned), we negate
- // the values to be stored in the string, and then at run time after looking up the offset in the string, negate it again.
- Debug.Assert(rbm.Pattern.Length <= char.MaxValue, "RegexBoyerMoore should have limited the size allowed.");
- Span<char> span = new char[negativeRange];
- for (int i = 0; i < span.Length; i++)
- {
- int offset = rbm.NegativeASCII[i + rbm.LowASCII];
- if (offset == beforefirst)
- {
- offset = rbm.Pattern.Length;
- }
- else if (rtl)
- {
- offset = -offset;
- }
- Debug.Assert(offset >= 0 && offset <= char.MaxValue);
- span[i] = (char)offset;
- }
-
- writer.WriteLine($"runtextpos {(rtl ? "-=" : "+=")} {Literal(span.ToString())}[ch];");
- }
- else
- {
- Debug.Assert(negativeRange == 1); // High == Low
- int offset = rbm.NegativeASCII[rbm.LowASCII];
- if (offset == beforefirst)
- {
- offset = rtl ? -rbm.Pattern.Length : rbm.Pattern.Length;
- }
- EmitAdd(writer, "runtextpos", offset);
- }
- writer.WriteLine("continue;");
- }
- writer.WriteLine();
- writer.WriteLine("int test = runtextpos;");
- writer.WriteLine();
-
- for (int i = rbm.Pattern.Length - 2; i >= 0; i--)
- {
- int charIndex = !rtl ? i : rbm.Pattern.Length - 1 - i;
- bool sameAsPrev = i < rbm.Pattern.Length - 2 && rbm.Positive[charIndex] == rbm.Positive[!rtl ? i + 1 : rbm.Pattern.Length - 1 - (i + 1)];
- bool sameAsNext = i > 0 && rbm.Positive[charIndex] == rbm.Positive[!rtl ? i - 1 : rbm.Pattern.Length - 1 - (i - 1)];
-
- string condition = $"{ToLowerIfNeeded(hasTextInfo, options, (!rtl ? "runtext[--test]" : "runtext[++test]"), rbm.CaseInsensitive && RegexCharClass.ParticipatesInCaseConversion(rbm.Pattern[charIndex]))} != {Literal(rbm.Pattern[charIndex])}";
- switch ((sameAsPrev, sameAsNext))
- {
- case (true, true):
- writer.WriteLine($" {condition} ||");
- break;
-
- case (false, true):
- writer.WriteLine($"if ({condition} ||");
- break;
-
- case (true, false):
- writer.WriteLine($" {condition})");
- using (EmitBlock(writer, null))
- {
- EmitAdd(writer, "runtextpos", rbm.Positive[charIndex]);
- writer.WriteLine("continue;");
- }
- writer.WriteLine();
- break;
- case (false, false):
- using (EmitBlock(writer, $"if ({condition})"))
- {
- EmitAdd(writer, "runtextpos", rbm.Positive[charIndex]);
- writer.WriteLine("continue;");
- }
- writer.WriteLine();
- break;
- }
- }
-
- writer.WriteLine(!rtl ?
- "base.runtextpos = test;" :
- "base.runtextpos = test + 1;");
- writer.WriteLine("return true;");
- }
+ return false;
}
- void EmitIndexOf(string prefix)
+ // Emits a case-sensitive left-to-right prefix search for a string at the beginning of the pattern.
+ void EmitIndexOf_LeftToRight(string prefix)
{
writer.WriteLine($"int i = global::System.MemoryExtensions.IndexOf(global::System.MemoryExtensions.AsSpan(runtext, runtextpos, runtextend - runtextpos), {Literal(prefix)});");
writer.WriteLine("if (i >= 0)");
writer.WriteLine("}");
}
- void EmitLeadingCharacter_RightToLeft()
+ // Emits a case-sensitive right-to-left prefix search for a string at the beginning of the pattern. The generated code looks for the last occurrence of the prefix in the text between runtextbeg and runtextpos and, if found, stores the resulting position in base.runtextpos and returns true; when nothing is found it emits no return and falls through to whatever the caller emits next.
+ void EmitIndexOf_RightToLeft(string prefix)
{
- EmitTextInfoIfRequired(writer, ref textInfoEmitted, ref hasTextInfo, rm);
+ writer.WriteLine($"int i = global::System.MemoryExtensions.LastIndexOf(global::System.MemoryExtensions.AsSpan(runtext, runtextbeg, runtextpos - runtextbeg), {Literal(prefix)});"); // the searched span starts at runtextbeg, so i is relative to runtextbeg
+ writer.WriteLine("if (i >= 0)");
+ writer.WriteLine("{");
+ writer.WriteLine($" base.runtextpos = runtextbeg + i + {prefix.Length};"); // index just past the end of the found prefix (presumably where right-to-left matching resumes - confirm against the RTL matcher)
+ writer.WriteLine(" return true;");
+ writer.WriteLine("}");
+ }
- Debug.Assert(lcc.Length == 1, "Only the FirstChars and not MultiFirstChars computation is supported for RightToLeft");
- string set = lcc[0].CharClass;
- if (RegexCharClass.IsSingleton(set))
+ // Emits a right-to-left search for a set at a fixed position from the start of the pattern.
+ // (Currently that position will always be a distance of 0, meaning the start of the pattern itself.) Two shapes are emitted: a LastIndexOf call when the set is a single case-sensitive character, otherwise a backwards loop that tests each character against the set. On success the generated code stores the index just past the matching character in base.runtextpos and returns true.
+ void EmitFixedSet_RightToLeft()
+ {
+ (char[]? Chars, string Set, int Distance, bool CaseInsensitive) set = code.FindOptimizations.FixedDistanceSets![0]; // only the first fixed-distance set is consulted here
+ Debug.Assert(set.Distance == 0); // the emitted code has no handling for a non-zero distance
+
+ if (set.Chars is { Length: 1 } && !set.CaseInsensitive) // exactly one known character, compared case-sensitively: search for it directly
{
- char ch = RegexCharClass.SingletonChar(set);
- using (EmitBlock(writer, "for (int i = runtextpos - 1; i >= runtextbeg; i--)"))
- {
- using (EmitBlock(writer, $"if (runtext[i] == {ToLowerIfNeeded(hasTextInfo, options, Literal(ch), lcc[0].CaseInsensitive)})"))
- {
- writer.WriteLine("base.runtextpos = i + 1;");
- writer.WriteLine("return true;");
- }
- }
+ writer.WriteLine($"int i = global::System.MemoryExtensions.LastIndexOf(global::System.MemoryExtensions.AsSpan(runtext, runtextbeg, runtextpos - runtextbeg), {Literal(set.Chars[0])});"); // the span starts at runtextbeg, so i is relative to runtextbeg
+ writer.WriteLine("if (i >= 0)");
+ writer.WriteLine("{");
+ writer.WriteLine(" base.runtextpos = runtextbeg + i + 1;"); // index just past the found character
+ writer.WriteLine(" return true;");
+ writer.WriteLine("}");
}
else
{
using (EmitBlock(writer, "for (int i = runtextpos - 1; i >= runtextbeg; i--)"))
{
- using (EmitBlock(writer, $"if ({MatchCharacterClass(hasTextInfo, options, "runtext[i]", set, lcc[0].CaseInsensitive)})"))
+ using (EmitBlock(writer, $"if ({MatchCharacterClass(hasTextInfo, options, "runtext[i]", set.Set, set.CaseInsensitive)})")) // per-character membership test against the set, honoring its case-insensitivity flag
{
- writer.WriteLine("runtextpos = i + 1;");
+ writer.WriteLine("base.runtextpos = i + 1;"); // here i is an absolute index into runtext, so no runtextbeg offset is added
writer.WriteLine("return true;");
}
}
}
}
- void EmitLeadingCharacter_LeftToRight()
+ // Emits a left-to-right search for a set at a fixed position from the start of the pattern,
+ // and potentially other sets at other fixed positions in the pattern.
+ void EmitFixedSet_LeftToRight()
{
- Debug.Assert(lcc is not null && lcc.Length > 0);
-
- // If minRequiredLength > 0, we already output a more stringent check. In the rare case
- // where we were unable to get an accurate enough min required length to ensure it's larger
- // than the prefixes we calculated, we also need to ensure we have enough space for those,
- // as they also represent a min required length.
- if (minRequiredLength < lcc.Length)
- {
- writer.WriteLine($"// Validate at least {lcc.Length} characters are available to match");
- string endExpr = lcc.Length > 1 ? $"runtextend - {lcc.Length - 1}" : "runtextend";
- using (EmitBlock(writer, $"if (runtextpos >= {endExpr})"))
- {
- writer.WriteLine("goto ReturnFalse;");
- }
- writer.WriteLine();
- }
-
- writer.WriteLine("global::System.ReadOnlySpan<char> span = global::System.MemoryExtensions.AsSpan(runtext, runtextpos, runtextend - runtextpos);");
+ List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)>? sets = code.FindOptimizations.FixedDistanceSets;
+ (char[]? Chars, string Set, int Distance, bool CaseInsensitive) primarySet = sets![0];
+ const int MaxSets = 4;
+ int setsToUse = Math.Min(sets.Count, MaxSets);
// If we can use IndexOf{Any}, try to accelerate the skip loop via vectorization to match the first prefix.
// We can use it if this is a case-sensitive class with a small number of characters in the class.
- Span<char> setChars = stackalloc char[3]; // up to 3 characters handled by IndexOf{Any} below
- int setCharsCount = 0, charClassIndex = 0;
- bool canUseIndexOf =
- !lcc[0].CaseInsensitive &&
- (setCharsCount = RegexCharClass.GetSetChars(lcc[0].CharClass, setChars)) > 0 &&
- !RegexCharClass.IsNegated(lcc[0].CharClass);
- bool needLoop = !canUseIndexOf || lcc.Length > 1;
+ int setIndex = 0;
+ bool canUseIndexOf = !primarySet.CaseInsensitive && primarySet.Chars is not null;
+ bool needLoop = !canUseIndexOf || setsToUse > 1;
FinishEmitScope loopBlock = default;
if (needLoop)
{
- EmitTextInfoIfRequired(writer, ref textInfoEmitted, ref hasTextInfo, rm);
- writer.WriteLine();
- string upperBound = lcc.Length > 1 ? $"span.Length - {lcc.Length - 1}" : "span.Length";
+ writer.WriteLine("global::System.ReadOnlySpan<char> span = global::System.MemoryExtensions.AsSpan(runtext, runtextpos, runtextend - runtextpos);");
+ string upperBound = "span.Length" + (setsToUse > 1 || primarySet.Distance != 0 ? $" - {minRequiredLength - 1}" : "");
loopBlock = EmitBlock(writer, $"for (int i = 0; i < {upperBound}; i++)");
}
if (canUseIndexOf)
{
- charClassIndex = 1;
+ string span = needLoop ?
+ "span" :
+ "global::System.MemoryExtensions.AsSpan(runtext, runtextpos, runtextend - runtextpos)";
- string span = needLoop ? "span.Slice(i)" : "span";
- string indexOf = setCharsCount switch
+ span = (needLoop, primarySet.Distance) switch
{
- 1 => $"global::System.MemoryExtensions.IndexOf({span}, {Literal(setChars[0])})",
- 2 => $"global::System.MemoryExtensions.IndexOfAny({span}, {Literal(setChars[0])}, {Literal(setChars[1])})",
- _ => $"global::System.MemoryExtensions.IndexOfAny({span}, {Literal(setChars[0])}, {Literal(setChars[1])}, {Literal(setChars[2])})",
+ (false, 0) => span,
+ (true, 0) => $"{span}.Slice(i)",
+ (false, _) => $"{span}.Slice({primarySet.Distance})",
+ (true, _) => $"{span}.Slice(i + {primarySet.Distance})",
+ };
+
+ string indexOf = primarySet.Chars!.Length switch
+ {
+ 1 => $"global::System.MemoryExtensions.IndexOf({span}, {Literal(primarySet.Chars[0])})",
+ 2 => $"global::System.MemoryExtensions.IndexOfAny({span}, {Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])})",
+ 3 => $"global::System.MemoryExtensions.IndexOfAny({span}, {Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])}, {Literal(primarySet.Chars[2])})",
+ _ => $"global::System.MemoryExtensions.IndexOfAny({span}, {Literal(new string(primarySet.Chars))})",
};
if (needLoop)
writer.WriteLine("i += indexOfPos;");
writer.WriteLine();
- if (lcc.Length > 1)
+ if (setsToUse > 1)
{
- using (EmitBlock(writer, $"if (i >= span.Length - {lcc.Length - 1})"))
+ using (EmitBlock(writer, $"if (i >= span.Length - {minRequiredLength - 1})"))
{
writer.WriteLine("goto ReturnFalse;");
}
+ writer.WriteLine();
}
}
else
{
writer.WriteLine($"int i = {indexOf};");
- using (EmitBlock(writer, "if (i < 0)"))
+ using (EmitBlock(writer, "if (i >= 0)"))
{
- writer.WriteLine("goto ReturnFalse;");
+ writer.WriteLine("base.runtextpos = runtextpos + i;");
+ writer.WriteLine("return true;");
}
}
- writer.WriteLine();
+
+ setIndex = 1;
}
- Debug.Assert(charClassIndex == 0 || charClassIndex == 1);
- bool hasCharClassConditions = false;
- if (charClassIndex < lcc.Length)
+ if (needLoop)
{
- // if (CharInClass(textSpan[i + charClassIndex], prefix[0], "...") &&
- // ...)
- Debug.Assert(needLoop);
- int start = charClassIndex;
- for (; charClassIndex < lcc.Length; charClassIndex++)
+ Debug.Assert(setIndex == 0 || setIndex == 1);
+ bool hasCharClassConditions = false;
+ if (setIndex < setsToUse)
{
- string spanIndex = charClassIndex > 0 ? $"span[i + {charClassIndex}]" : "span[i]";
- string charInClassExpr = MatchCharacterClass(hasTextInfo, options, spanIndex, lcc[charClassIndex].CharClass, lcc[charClassIndex].CaseInsensitive);
-
- if (charClassIndex == start)
+ // if (CharInClass(textSpan[i + charClassIndex], prefix[0], "...") &&
+ // ...)
+ Debug.Assert(needLoop);
+ int start = setIndex;
+ for (; setIndex < setsToUse; setIndex++)
{
- writer.Write($"if ({charInClassExpr}");
- }
- else
- {
- writer.WriteLine(" &&");
- writer.Write($" {charInClassExpr}");
+ string spanIndex = $"span[i{(sets[setIndex].Distance > 0 ? $" + {sets[setIndex].Distance}" : "")}]";
+ string charInClassExpr = MatchCharacterClass(hasTextInfo, options, spanIndex, sets[setIndex].Set, sets[setIndex].CaseInsensitive);
+
+ if (setIndex == start)
+ {
+ writer.Write($"if ({charInClassExpr}");
+ }
+ else
+ {
+ writer.WriteLine(" &&");
+ writer.Write($" {charInClassExpr}");
+ }
}
+ writer.WriteLine(")");
+ hasCharClassConditions = true;
}
- writer.WriteLine(")");
- hasCharClassConditions = true;
- }
- using (hasCharClassConditions ? EmitBlock(writer, null) : default)
- {
- writer.WriteLine("base.runtextpos = runtextpos + i;");
- writer.WriteLine("return true;");
+ using (hasCharClassConditions ? EmitBlock(writer, null) : default)
+ {
+ writer.WriteLine("base.runtextpos = runtextpos + i;");
+ writer.WriteLine("return true;");
+ }
}
loopBlock.Dispose();
}
+
+ // Emits a local named "textInfo", initialized to the current culture's TextInfo, when the selected
+ // find mode or any fixed-distance set is case-insensitive, and sets hasTextInfo to record that the
+ // local exists. Nothing is emitted when RegexOptions.CultureInvariant is specified.
+ static void EmitTextInfo(IndentedTextWriter writer, ref bool hasTextInfo, RegexMethod rm)
+ {
+     // With CultureInvariant there is no current-culture local to emit.
+     if ((rm.Options & RegexOptions.CultureInvariant) != 0)
+     {
+         return;
+     }
+
+     // The case-insensitive find modes always need the TextInfo...
+     bool caseInsensitiveSearch = rm.Code.FindOptimizations.FindMode is
+         FindNextStartingPositionMode.FixedLiteral_LeftToRight_CaseInsensitive or
+         FindNextStartingPositionMode.LeadingLiteral_RightToLeft_CaseInsensitive or
+         FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive or
+         FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive or
+         FindNextStartingPositionMode.LeadingSet_RightToLeft_CaseInsensitive;
+
+     // ...and for every other mode it is still needed if at least one fixed-distance set is case-insensitive.
+     if (!caseInsensitiveSearch &&
+         rm.Code.FindOptimizations.FixedDistanceSets is List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)> fixedSets)
+     {
+         caseInsensitiveSearch = fixedSets.Exists(candidate => candidate.CaseInsensitive);
+     }
+
+     if (!caseInsensitiveSearch)
+     {
+         return;
+     }
+
+     hasTextInfo = true;
+     writer.WriteLine("global::System.Globalization.TextInfo textInfo = global::System.Globalization.CultureInfo.CurrentCulture.TextInfo;");
+ }
}
/// <summary>Emits the body of the Go override.</summary>
/// <summary>Emits the body of a simplified Go implementation that's possible when there's minimal backtracking required by the expression.</summary>
private static void EmitSimplifiedGo(IndentedTextWriter writer, RegexMethod rm, string id)
{
+ // Arbitrary limit for unrolling vs creating a loop. We want to balance size in the generated
+ // code with other costs, like the (small) overhead of slicing to create the temp span to iterate.
+ const int MaxUnrollSize = 16;
+
RegexOptions options = (RegexOptions)rm.Options;
RegexCode code = rm.Code;
- (string CharClass, bool CaseInsensitive)[]? lcc = code.LeadingCharClasses;
bool rtl = code.RightToLeft;
bool hasTimeout = false;
break;
case RegexNode.Concatenate:
- int childCount = node.ChildCount();
- for (int i = 0; i < childCount; i++)
- {
- if (emitLengthChecksIfRequired && node.TryGetJoinableLengthCheckChildRange(i, out int requiredLength, out int exclusiveEnd))
- {
- EmitSpanLengthCheck(requiredLength);
- writer.WriteLine();
-
- for (; i < exclusiveEnd; i++)
- {
- EmitNode(node.Child(i), i + 1 < childCount ? node.Child(i + 1) : subsequent, emitLengthChecksIfRequired: false);
- }
-
- i--;
- }
- else
- {
- EmitNode(node.Child(i), i + 1 < childCount ? node.Child(i + 1) : subsequent, emitLengthChecksIfRequired: emitLengthChecksIfRequired);
- }
- }
+ EmitConcatenation(node, subsequent, emitLengthChecksIfRequired);
break;
case RegexNode.Capture:
writer.WriteLine("base.runtextpos = runtextpos;");
}
+ void EmitConcatenation(RegexNode node, RegexNode? subsequent, bool emitLengthChecksIfRequired) // Emits code for every child of a concatenation node; where a run of children can share one length check, that check and the run's single-character comparisons are folded into a single "if (... || ...)" that jumps to doneLabel on failure.
+ {
+ int childCount = node.ChildCount();
+ for (int i = 0; i < childCount; i++)
+ {
+ if (emitLengthChecksIfRequired && node.TryGetJoinableLengthCheckChildRange(i, out int requiredLength, out int exclusiveEnd))
+ {
+ bool wroteClauses = true; // the length check written on the next line is the first clause of an already-open "if ("
+ writer.Write($"if ({SpanLengthCheck(requiredLength)}");
+
+ while (i < exclusiveEnd) // each pass emits one combined "if" followed by at most one child that could not be folded into it
+ {
+ for (; i < exclusiveEnd; i++)
+ {
+ void WriteSingleCharChild(RegexNode child) // appends one comparison clause, opening a new "if (" when no condition is currently open
+ {
+ if (wroteClauses)
+ {
+ writer.WriteLine(" ||");
+ writer.Write(" ");
+ }
+ else
+ {
+ writer.Write("if (");
+ }
+ EmitSingleChar(child, emitLengthCheck: false, clauseOnly: true); // clauseOnly: only the comparison expression is written, without the surrounding "if" block
+ wroteClauses = true;
+ }
+
+ RegexNode child = node.Child(i);
+ if (child.Type is RegexNode.One or RegexNode.Notone or RegexNode.Set)
+ {
+ WriteSingleCharChild(child);
+ writer.Write($" /* {DescribeNode(child)} */");
+ }
+ else if (child.Type is RegexNode.Oneloop or RegexNode.Onelazy or RegexNode.Oneloopatomic or
+ RegexNode.Setloop or RegexNode.Setlazy or RegexNode.Setloopatomic or
+ RegexNode.Notoneloop or RegexNode.Notonelazy or RegexNode.Notoneloopatomic &&
+ child.M == child.N &&
+ child.M <= MaxUnrollSize) // a loop with a fixed iteration count (M == N) small enough to unroll; the "or" list is the pattern of the "is", and "&&" applies to its result
+ {
+ for (int c = 0; c < child.M; c++) // one clause per iteration of the fixed-count loop
+ {
+ WriteSingleCharChild(child);
+ if (c == 0)
+ {
+ writer.Write($" /* {DescribeNode(child)} */"); // describe the node once, next to its first clause
+ }
+ }
+ }
+ else
+ {
+ break; // this child cannot be written as a clause; it is emitted through EmitNode below
+ }
+ }
+
+ if (wroteClauses) // close the pending condition and emit its failure branch
+ {
+ writer.WriteLine(")");
+ using (EmitBlock(writer, null))
+ {
+ writer.WriteLine($"goto {doneLabel};");
+ }
+ wroteClauses = false;
+ }
+
+ if (i < exclusiveEnd) // the inner loop stopped early on a child that is not a single-character check
+ {
+ writer.WriteLine();
+ EmitNode(node.Child(i), i + 1 < childCount ? node.Child(i + 1) : subsequent, emitLengthChecksIfRequired: false); // no further length checks requested - presumably the joined check above already covers this range (confirm against TryGetJoinableLengthCheckChildRange)
+ i++;
+ }
+ }
+
+ i--; // compensate for the i++ of the enclosing for loop, since i already points at exclusiveEnd
+ }
+ else
+ {
+ EmitNode(node.Child(i), i + 1 < childCount ? node.Child(i + 1) : subsequent, emitLengthChecksIfRequired: emitLengthChecksIfRequired);
+ }
+ }
+ }
+
// Emits the code to handle a single-character match.
- void EmitSingleChar(RegexNode node, bool emitLengthCheck = true, string? offset = null)
+ void EmitSingleChar(RegexNode node, bool emitLengthCheck = true, string? offset = null, bool clauseOnly = false)
{
// This only emits a single check, but it's called from the looping constructs in a loop
// to generate the code for a single check, so we map those looping constructs to the
}
else
{
- expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive(node) && RegexCharClass.ParticipatesInCaseConversion(node.Ch));
+ expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive(node));
expr = $"{expr} {(node.IsOneFamily ? "!=" : "==")} {Literal(node.Ch)}";
}
- using (EmitBlock(writer, emitLengthCheck ? $"if ({SpanLengthCheck(1, offset)} || {expr})" : $"if ({expr})"))
+ if (clauseOnly)
{
- writer.WriteLine($"goto {doneLabel};");
+ writer.Write(expr);
+ }
+ else
+ {
+ using (EmitBlock(writer, emitLengthCheck ? $"if ({SpanLengthCheck(1, offset)} || {expr})" : $"if ({expr})"))
+ {
+ writer.WriteLine($"goto {doneLabel};");
+ }
}
textSpanPos++;
EmitSpanLengthCheck(iterations);
}
- // Arbitrary limit for unrolling vs creating a loop. We want to balance size in the generated
- // code with other costs, like the (small) overhead of slicing to create the temp span to iterate.
- const int MaxUnrollSize = 16;
-
if (iterations <= MaxUnrollSize)
{
// if (textSpan[textSpanPos] != c1 ||
int minIterations = node.M;
int maxIterations = node.N;
- Span<char> setChars = stackalloc char[3]; // 3 is max we can use with IndexOfAny
+ Span<char> setChars = stackalloc char[5]; // 5 is max optimized by IndexOfAny today
int numSetChars = 0;
string iterationLocal = NextLocalName("i");
if (node.IsNotoneFamily &&
maxIterations == int.MaxValue &&
- (!IsCaseInsensitive(node) || !RegexCharClass.ParticipatesInCaseConversion(node.Ch)))
+ (!IsCaseInsensitive(node)))
{
// For Notone, we're looking for a specific character, as everything until we find
// it is consumed by the loop. If we're unbounded, such as with ".*" and if we're case-sensitive,
else if (node.IsSetFamily &&
maxIterations == int.MaxValue &&
!IsCaseInsensitive(node) &&
- (numSetChars = RegexCharClass.GetSetChars(node.Str!, setChars)) > 1 &&
+ (numSetChars = RegexCharClass.GetSetChars(node.Str!, setChars)) != 0 &&
RegexCharClass.IsNegated(node.Str!))
{
- // If the set is negated and contains only 2 or 3 characters (if it contained 1 and was negated, it would
+ // If the set is negated and contains only a few characters (if it contained 1 and was negated, it should
// have been reduced to a Notone), we can use an IndexOfAny to find any of the target characters.
// As with the notoneloopatomic above, the unbounded constraint is purely for simplicity.
+ Debug.Assert(numSetChars > 1);
writer.Write($"int {iterationLocal} = global::System.MemoryExtensions.IndexOfAny({textSpanLocal}");
if (textSpanPos != 0)
{
writer.Write($".Slice({textSpanPos})");
}
- writer.WriteLine(numSetChars == 2 ?
- $", {Literal(setChars[0])}, {Literal(setChars[1])});" :
- $", {Literal(setChars[0])}, {Literal(setChars[1])}, {Literal(setChars[2])});");
+ writer.WriteLine(numSetChars switch
+ {
+ 2 => $", {Literal(setChars[0])}, {Literal(setChars[1])});",
+ 3 => $", {Literal(setChars[0])}, {Literal(setChars[1])}, {Literal(setChars[2])});",
+ _ => $", {Literal(setChars.Slice(0, numSetChars).ToString())});",
+ });
using (EmitBlock(writer, $"if ({iterationLocal} == -1)"))
{
writer.WriteLine(textSpanPos > 0 ?
}
else
{
- expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive(node) && RegexCharClass.ParticipatesInCaseConversion(node.Ch));
+ expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive(node));
expr = $"{expr} {(node.IsOneFamily ? "==" : "!=")} {Literal(node.Ch)}";
}
}
else
{
- expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive(node) && RegexCharClass.ParticipatesInCaseConversion(node.Ch));
+ expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive(node));
expr = $"{expr} {(node.IsOneFamily ? "==" : "!=")} {Literal(node.Ch)}";
}
clause += Code() == RegexCode.Set ?
$"!{MatchCharacterClass(hasTextInfo, options, expr, rm.Code.Strings[Operand(0)], IsCaseInsensitive())}" :
- $"{ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive() && RegexCharClass.ParticipatesInCaseConversion(Operand(0)))} {(Code() == RegexCode.One ? "!=" : "==")} {Operand(0)}";
+ $"{ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive())} {(Code() == RegexCode.One ? "!=" : "==")} {Operand(0)}";
using (EmitBlock(writer, $"if ({clause})"))
{
writer.WriteLine($"if (runtextend - runtextpos < {str.Length} ||");
for (int i = 0; i < str.Length; i++)
{
- writer.Write($" {ToLowerIfNeeded(hasTextInfo, options, $"runtext[runtextpos{(i == 0 ? "" : $" + {i}")}]", IsCaseInsensitive() && RegexCharClass.ParticipatesInCaseConversion(str[i]))} != {Literal(str[i])}");
+ writer.Write($" {ToLowerIfNeeded(hasTextInfo, options, $"runtext[runtextpos{(i == 0 ? "" : $" + {i}")}]", IsCaseInsensitive())} != {Literal(str[i])}");
writer.WriteLine(i < str.Length - 1 ? " ||" : ")");
}
using (EmitBlock(writer, null))
for (int i = str.Length; i > 0;)
{
i--;
- writer.Write($" {ToLowerIfNeeded(hasTextInfo, options, $"runtext[runtextpos - {str.Length - i}]", IsCaseInsensitive() && RegexCharClass.ParticipatesInCaseConversion(str[i]))} != {Literal(str[i])}");
+ writer.Write($" {ToLowerIfNeeded(hasTextInfo, options, $"runtext[runtextpos - {str.Length - i}]", IsCaseInsensitive())} != {Literal(str[i])}");
writer.WriteLine(i == 0 ? ")" : " ||");
}
using (EmitBlock(writer, null))
}
else
{
- expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive() && RegexCharClass.ParticipatesInCaseConversion(Operand(0)));
+ expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive());
expr = $"{expr} {(Code() == RegexCode.Onerep ? "!=" : "==")} {Literal((char)Operand(0))}";
}
}
else
{
- expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive() && RegexCharClass.ParticipatesInCaseConversion(Operand(0)));
+ expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive());
string op = Code() == RegexCode.Onerep ? "!=" : "==";
using (EmitBlock(writer, $"if ({expr} {op} {Literal((char)Operand(0))})"))
{
}
string? set = Code() == RegexCode.Setloop || Code() == RegexCode.Setloopatomic ? rm.Code.Strings[Operand(0)] : null;
- Span<char> setChars = stackalloc char[3];
+ Span<char> setChars = stackalloc char[5]; // max optimized by IndexOfAny today
int numSetChars;
// If this is a notoneloop{atomic} and we're left-to-right and case-sensitive,
// we can use the vectorized IndexOf to search for the target character.
if ((Code() == RegexCode.Notoneloop || Code() == RegexCode.Notoneloopatomic) &&
!IsRightToLeft() &&
- (!IsCaseInsensitive() || !RegexCharClass.ParticipatesInCaseConversion(Operand(0))))
+ !IsCaseInsensitive())
{
writer.WriteLine($"{I} = global::System.MemoryExtensions.IndexOf(global::System.MemoryExtensions.AsSpan(runtext, runtextpos, {Len}), {Literal((char)Operand(0))}); // i");
using (EmitBlock(writer, $"if ({I} == -1)"))
else if ((Code() == RegexCode.Setloop || Code() == RegexCode.Setloopatomic) &&
!IsRightToLeft() &&
!IsCaseInsensitive() &&
- (numSetChars = RegexCharClass.GetSetChars(set!, setChars)) > 1 &&
+ (numSetChars = RegexCharClass.GetSetChars(set!, setChars)) != 0 &&
RegexCharClass.IsNegated(set!))
{
// Similarly, if this is a setloop{atomic} and we're left-to-right and case-sensitive,
- // and if the set contains only 2 or 3 negated chars, we can use the vectorized IndexOfAny
+ // and if the set contains only a few negated chars, we can use the vectorized IndexOfAny
// to search for those chars.
-
- Debug.Assert(numSetChars is 2 or 3);
- writer.Write($"{I} = global::System.MemoryExtensions.IndexOfAny(global::System.MemoryExtensions.AsSpan(runtext, runtextpos, {Len}), {Literal(setChars[0])}, {Literal(setChars[1])}");
- if (numSetChars == 3)
+ Debug.Assert(numSetChars > 1);
+ writer.WriteLine(numSetChars switch
{
- writer.Write($", {Literal(setChars[2])}");
- }
- writer.WriteLine("); // i");
+ 2 => $"{I} = global::System.MemoryExtensions.IndexOfAny(global::System.MemoryExtensions.AsSpan(runtext, runtextpos, {Len}), {Literal(setChars[0])}, {Literal(setChars[1])}); // i",
+ 3 => $"{I} = global::System.MemoryExtensions.IndexOfAny(global::System.MemoryExtensions.AsSpan(runtext, runtextpos, {Len}), {Literal(setChars[0])}, {Literal(setChars[1])}, {Literal(setChars[2])}); // i",
+ _ => $"{I} = global::System.MemoryExtensions.IndexOfAny(global::System.MemoryExtensions.AsSpan(runtext, runtextpos, {Len}), {Literal(setChars.Slice(0, numSetChars).ToString())}); // i",
+ });
using (EmitBlock(writer, $"if ({I} == -1)"))
{
writer.WriteLine($"runtextpos += {Len};");
else
{
string op = Code() == RegexCode.Oneloop || Code() == RegexCode.Oneloopatomic ? "!=" : "==";
- expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive() && RegexCharClass.ParticipatesInCaseConversion(Operand(0)));
+ expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive());
expr = $"{expr} {op} {Literal((char)Operand(0))}";
}
}
else
{
- expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive() && RegexCharClass.ParticipatesInCaseConversion(Operand(0)));
+ expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive());
expr = $"{expr} {(Code() == RegexCode.Onelazy ? "!=" : "==")} {Literal((char)Operand(0))}";
}
private static string ToLowerIfNeeded(bool hasTextInfo, RegexOptions options, string expression, bool toLower) => toLower ? ToLower(hasTextInfo, options, expression) : expression;
- private static void EmitTextInfoIfRequired(IndentedTextWriter writer, ref bool textInfoEmitted, ref bool hasTextInfo, RegexMethod rm)
- {
- if (textInfoEmitted)
- {
- return;
- }
- textInfoEmitted = true;
-
- // Emit local to store current culture if needed
- if ((((RegexOptions)rm.Options) & RegexOptions.CultureInvariant) == 0)
- {
- bool needsCulture = (((RegexOptions)rm.Options) & RegexOptions.IgnoreCase) != 0 || rm.Code.BoyerMoorePrefix?.CaseInsensitive == true;
- if (!needsCulture && rm.Code.LeadingCharClasses is not null)
- {
- for (int i = 0; i < rm.Code.LeadingCharClasses.Length; i++)
- {
- if (rm.Code.LeadingCharClasses[i].CaseInsensitive)
- {
- needsCulture = true;
- break;
- }
- }
- }
-
- if (needsCulture)
- {
- hasTextInfo = true;
- writer.WriteLine("// IgnoreCase with CultureInfo.CurrentCulture");
- writer.WriteLine("global::System.Globalization.TextInfo textInfo = global::System.Globalization.CultureInfo.CurrentCulture.TextInfo;");
- writer.WriteLine();
- }
- }
- }
-
private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options, string chExpr, string charClass, bool caseInsensitive)
{
// We need to perform the equivalent of calling RegexRunner.CharInClass(ch, charClass),
return $"(char.GetUnicodeCategory({chExpr}) {(negated ? "!=" : "==")} global::System.Globalization.UnicodeCategory.{category})";
}
- // Next, if there's only 2 or 3 chars in the set (fairly common due to the sets we create for prefixes),
- // it's cheaper and smaller to compare against each than it is to use a lookup table.
- if (!invariant)
+ // Next, if there's only 2, 3, or 4 chars in the set (fairly common due to the sets we create for prefixes),
+ // it may be cheaper and smaller to compare against each than it is to use a lookup table. We can also special-case
+ // the very common case with case insensitivity of two characters next to each other being the upper and lowercase
+ // ASCII variants of each other, in which case we can use bit manipulation to avoid a comparison.
+ if (!invariant && !RegexCharClass.IsNegated(charClass))
{
- Span<char> setChars = stackalloc char[3];
- int numChars = RegexCharClass.GetSetChars(charClass, setChars);
- if (!RegexCharClass.IsNegated(charClass))
+ Span<char> setChars = stackalloc char[4];
+ switch (RegexCharClass.GetSetChars(charClass, setChars))
{
- switch (numChars)
- {
- case 2:
- return $"(((ch = {chExpr}) == {Literal(setChars[0])}) | (ch == {Literal(setChars[1])}))";
+ case 2:
+ return (setChars[0] | 0x20) == setChars[1] ?
+ $"(({chExpr} | 0x20) == {Literal(setChars[1])})" :
+ $"(((ch = {chExpr}) == {Literal(setChars[0])}) | (ch == {Literal(setChars[1])}))";
- case 3:
- return $"(((ch = {chExpr}) == {Literal(setChars[0])}) | (ch == {Literal(setChars[1])}) | (ch == {Literal(setChars[2])}))";
- }
+ case 3:
+ return (setChars[0] | 0x20) == setChars[1] ?
+ $"((((ch = {chExpr}) | 0x20) == {Literal(setChars[1])}) | (ch == {Literal(setChars[2])}))" :
+ $"(((ch = {chExpr}) == {Literal(setChars[0])}) | (ch == {Literal(setChars[1])}) | (ch == {Literal(setChars[2])}))";
+
+ case 4:
+ if (((setChars[0] | 0x20) == setChars[1]) &&
+ ((setChars[2] | 0x20) == setChars[3]))
+ {
+ return $"(((ch = ({chExpr} | 0x20)) == {Literal(setChars[1])}) | (ch == {Literal(setChars[3])}))";
+ }
+ break;
}
}
RegexCode code;
try
{
- code = RegexWriter.Write(RegexParser.Parse(pattern, regexOptions, culture));
+ code = RegexWriter.Write(RegexParser.Parse(pattern, regexOptions, culture), culture);
}
catch (Exception e)
{
{
internal static class InterlockedExtensions
{
- public static int Or(ref int location1, int value)
+ public static uint Or(ref uint location1, uint value)
{
- int current = location1;
+ uint current = location1;
while (true)
{
- int newValue = current | value;
- int oldValue = Interlocked.CompareExchange(ref location1, newValue, current);
+ uint newValue = current | value;
+ uint oldValue = (uint)Interlocked.CompareExchange(ref Unsafe.As<uint, int>(ref location1), (int)newValue, (int)current);
if (oldValue == current)
{
return oldValue;
<Compile Include="$(CoreLibSharedDir)System\Collections\Generic\ValueListBuilder.cs" Link="Production\ValueListBuilder.cs" />
<Compile Include="..\src\System\Collections\Generic\ValueListBuilder.Pop.cs" Link="Production\ValueListBuilder.Pop.cs" />
<Compile Include="..\src\System\Threading\StackHelper.cs" Link="Production\StackHelper.cs" />
- <Compile Include="..\src\System\Text\RegularExpressions\RegexBoyerMoore.cs" Link="Production\RegexBoyerMoore.cs" />
<Compile Include="..\src\System\Text\RegularExpressions\RegexCharClass.cs" Link="Production\RegexCharClass.cs" />
<Compile Include="..\src\System\Text\RegularExpressions\RegexCharClass.MappingTable.cs" Link="Production\RegexCharClass.MappingTable.cs" />
<Compile Include="..\src\System\Text\RegularExpressions\RegexCode.cs" Link="Production\RegexCode.cs" />
+ <Compile Include="..\src\System\Text\RegularExpressions\RegexFindOptimizations.cs" Link="Production\RegexFindOptimizations.cs" />
<Compile Include="..\src\System\Text\RegularExpressions\RegexNode.cs" Link="Production\RegexNode.cs" />
<Compile Include="..\src\System\Text\RegularExpressions\RegexOptions.cs" Link="Production\RegexOptions.cs" />
<Compile Include="..\src\System\Text\RegularExpressions\RegexParseError.cs" Link="Production\RegexParseError.cs" />
<Compile Include="System\Text\RegularExpressions\Regex.Replace.cs" />
<Compile Include="System\Text\RegularExpressions\Regex.Split.cs" />
<Compile Include="System\Text\RegularExpressions\Regex.Timeout.cs" />
- <Compile Include="System\Text\RegularExpressions\RegexBoyerMoore.cs" />
<Compile Include="System\Text\RegularExpressions\RegexCharClass.cs" />
<Compile Include="System\Text\RegularExpressions\RegexCharClass.MappingTable.cs" />
<Compile Include="System\Text\RegularExpressions\RegexCode.cs" />
<Compile Include="System\Text\RegularExpressions\RegexCompilationInfo.cs" />
+ <Compile Include="System\Text\RegularExpressions\RegexFindOptimizations.cs" />
<Compile Include="System\Text\RegularExpressions\RegexGeneratorAttribute.cs" />
<Compile Include="System\Text\RegularExpressions\RegexInterpreter.cs" />
<Compile Include="System\Text\RegularExpressions\RegexMatchTimeoutException.cs" />
<Reference Include="System.Memory" />
<Reference Include="System.Runtime" />
<Reference Include="System.Runtime.Extensions" />
+ <Reference Include="System.Runtime.InteropServices" />
<Reference Include="System.Threading" />
<!-- References required for RegexOptions.Compiled -->
<Reference Include="System.Reflection.Emit" />
Regex.ValidateOptions(options);
Regex.ValidateMatchTimeout(matchTimeout);
- CultureInfo culture = (options & RegexOptions.CultureInvariant) != 0 ? CultureInfo.InvariantCulture : CultureInfo.CurrentCulture;
+ CultureInfo culture = RegexParser.GetTargetCulture(options);
Key key = new Key(pattern, culture.ToString(), options, matchTimeout);
Regex? regex = Get(key);
// Call Init directly rather than delegating to a Regex ctor that takes
// options to enable linking / tree shaking to remove the Regex compiler
// and NonBacktracking implementation if it's not used.
- Init(pattern, RegexOptions.None, s_defaultMatchTimeout, culture);
+ Init(pattern, RegexOptions.None, s_defaultMatchTimeout, culture ?? CultureInfo.CurrentCulture);
}
internal Regex(string pattern, RegexOptions options, TimeSpan matchTimeout, CultureInfo? culture)
{
- culture ??= GetTargetCulture(options);
+ culture ??= RegexParser.GetTargetCulture(options);
Init(pattern, options, matchTimeout, culture);
if ((options & RegexOptions.NonBacktracking) != 0)
}
}
- /// <summary>Gets the culture to use based on the specified options.</summary>
- private static CultureInfo GetTargetCulture(RegexOptions options) =>
- (options & RegexOptions.CultureInvariant) != 0 ? CultureInfo.InvariantCulture : CultureInfo.CurrentCulture;
-
/// <summary>Initializes the instance.</summary>
/// <remarks>
/// This is separated out of the constructor so that an app only using 'new Regex(pattern)'
/// compiler, such that a tree shaker / linker can trim it away if it's not otherwise used.
/// </remarks>
[MemberNotNull(nameof(_code))]
- private void Init(string pattern, RegexOptions options, TimeSpan matchTimeout, CultureInfo? culture)
+ private void Init(string pattern, RegexOptions options, TimeSpan matchTimeout, CultureInfo culture)
{
ValidatePattern(pattern);
ValidateOptions(options);
this.pattern = pattern;
internalMatchTimeout = matchTimeout;
roptions = options;
- culture ??= GetTargetCulture(options);
#if DEBUG
if (IsDebug)
// Generate the RegexCode from the node tree. This is required for interpreting,
// and is used as input into RegexOptions.Compiled and RegexOptions.NonBacktracking.
- _code = RegexWriter.Write(tree);
+ _code = RegexWriter.Write(tree, culture);
if ((options & RegexOptions.NonBacktracking) != 0)
{
/// <summary>Creates a new runner instance.</summary>
private RegexRunner CreateRunner() =>
factory?.CreateInstance() ??
- new RegexInterpreter(_code!, GetTargetCulture(roptions));
+ new RegexInterpreter(_code!, RegexParser.GetTargetCulture(roptions));
/// <summary>True if the <see cref="RegexOptions.Compiled"/> option was set.</summary>
protected bool UseOptionC() => (roptions & RegexOptions.Compiled) != 0;
+++ /dev/null
-// Licensed to the .NET Foundation under one or more agreements.
-// The .NET Foundation licenses this file to you under the MIT license.
-
-// The RegexBoyerMoore object precomputes the Boyer-Moore
-// tables for fast string scanning. These tables allow
-// you to scan for the first occurrence of a string within
-// a large body of text without examining every character.
-// The performance of the heuristic depends on the actual
-// string and the text being searched, but usually, the longer
-// the string that is being searched for, the fewer characters
-// need to be examined.
-
-using System.Diagnostics;
-using System.Diagnostics.CodeAnalysis;
-using System.Globalization;
-
-namespace System.Text.RegularExpressions
-{
- internal sealed class RegexBoyerMoore
- {
- public readonly int[] Positive;
- public readonly int[] NegativeASCII;
- public readonly int[][]? NegativeUnicode;
- public readonly string Pattern;
- public readonly int LowASCII;
- public readonly int HighASCII;
- public readonly bool RightToLeft;
- public readonly bool CaseInsensitive;
- private readonly CultureInfo _culture;
-
- /// <summary>The maximum prefix string length for which we'll attempt to create a Boyer-Moore table.</summary>
- /// <remarks>This is limited in order to minimize the overhead of constructing a Regex.</remarks>
- public const int MaxLimit = 50_000; // must be <= char.MaxValue for RegexCompiler to compile Boyer-Moore correctly
-
- /// <summary>
- /// Constructs a Boyer-Moore state machine for searching for the string
- /// pattern. The string must not be zero-length.
- /// </summary>
- public RegexBoyerMoore(string pattern, bool caseInsensitive, bool rightToLeft, CultureInfo culture)
- {
- // Sorry, you just can't use Boyer-Moore to find an empty pattern.
- // We're doing this for your own protection. (Really, for speed.)
- Debug.Assert(pattern.Length != 0, "RegexBoyerMoore called with an empty string. This is bad for perf");
- Debug.Assert(pattern.Length <= MaxLimit, "RegexBoyerMoore can take a long time for large patterns");
-#if DEBUG
- if (caseInsensitive)
- {
- foreach (char c in pattern)
- {
- // We expect each individual character to have been lower-cased. We don't validate the whole
- // string at once because the rest of the library doesn't currently recognize/support surrogate pairs.
- Debug.Assert(c == culture.TextInfo.ToLower(c), "Pattern wasn't lowercased with provided culture");
- }
- }
-#endif
-
- Pattern = pattern;
- RightToLeft = rightToLeft;
- CaseInsensitive = caseInsensitive;
- _culture = culture;
-
- int beforefirst;
- int last;
- int bump;
-
- if (!rightToLeft)
- {
- beforefirst = -1;
- last = pattern.Length - 1;
- bump = 1;
- }
- else
- {
- beforefirst = pattern.Length;
- last = 0;
- bump = -1;
- }
-
- // PART I - the good-suffix shift table
- //
- // compute the positive requirement:
- // if char "i" is the first one from the right that doesn't match,
- // then we know the matcher can advance by _positive[i].
- //
- // This algorithm is a simplified variant of the standard
- // Boyer-Moore good suffix calculation.
-
- Positive = new int[pattern.Length];
-
- int examine = last;
- char ch = pattern[examine];
- Positive[examine] = bump;
- examine -= bump;
- int scan;
- int match;
-
- while (true)
- {
- // find an internal char (examine) that matches the tail
-
- while (true)
- {
- if (examine == beforefirst)
- goto OuterloopBreak;
- if (pattern[examine] == ch)
- break;
- examine -= bump;
- }
-
- match = last;
- scan = examine;
-
- // find the length of the match
-
- while (true)
- {
- if (scan == beforefirst || pattern[match] != pattern[scan])
- {
- // at the end of the match, note the difference in _positive
- // this is not the length of the match, but the distance from the internal match
- // to the tail suffix.
- if (Positive[match] == 0)
- Positive[match] = match - scan;
-
- break;
- }
-
- scan -= bump;
- match -= bump;
- }
-
- examine -= bump;
- }
-
- OuterloopBreak:
-
- match = last - bump;
-
- // scan for the chars for which there are no shifts that yield a different candidate
-
-
- // The inside of the if statement used to say
- // "_positive[match] = last - beforefirst;"
- // This is slightly less aggressive in how much we skip, but at worst it
- // should mean a little more work rather than skipping a potential match.
- while (match != beforefirst)
- {
- if (Positive[match] == 0)
- Positive[match] = bump;
-
- match -= bump;
- }
-
- // PART II - the bad-character shift table
- //
- // compute the negative requirement:
- // if char "ch" is the reject character when testing position "i",
- // we can slide up by _negative[ch];
- // (_negative[ch] = str.Length - 1 - str.LastIndexOf(ch))
- //
- // the lookup table is divided into ASCII and Unicode portions;
- // only those parts of the Unicode 16-bit code set that actually
- // appear in the string are in the table. (Maximum size with
- // Unicode is 65K; ASCII only case is 512 bytes.)
-
- NegativeASCII = new int[128];
-
- for (int i = 0; i < 128; i++)
- NegativeASCII[i] = last - beforefirst;
-
- LowASCII = 127;
- HighASCII = 0;
-
- for (examine = last; examine != beforefirst; examine -= bump)
- {
- ch = pattern[examine];
-
- if (ch < 128)
- {
- if (LowASCII > ch)
- LowASCII = ch;
-
- if (HighASCII < ch)
- HighASCII = ch;
-
- if (NegativeASCII[ch] == last - beforefirst)
- NegativeASCII[ch] = last - examine;
- }
- else
- {
- int i = ch >> 8;
- int j = ch & 0xFF;
-
- if (NegativeUnicode == null)
- {
- NegativeUnicode = new int[256][];
- }
-
- if (NegativeUnicode[i] == null)
- {
- int[] newarray = new int[256];
-
- for (int k = 0; k < newarray.Length; k++)
- newarray[k] = last - beforefirst;
-
- if (i == 0)
- {
- Array.Copy(NegativeASCII, newarray, 128);
- NegativeASCII = newarray;
- }
-
- NegativeUnicode[i] = newarray;
- }
-
- if (NegativeUnicode[i][j] == last - beforefirst)
- NegativeUnicode[i][j] = last - examine;
- }
- }
- }
-
- // TODO: We should be able to avoid producing the RegexBoyerMoore instance
- // entirely if we're going to go down the code path of using IndexOf. That will
- // require some refactoring, though.
-
- /// <summary>Gets whether IndexOf could be used to perform the match.</summary>
- public bool PatternSupportsIndexOf =>
- !RightToLeft && (!CaseInsensitive || !RegexCharClass.ParticipatesInCaseConversion(Pattern));
-
- /// <summary>
- /// When a regex is anchored, we can do a quick IsMatch test instead of a Scan
- /// </summary>
- public bool IsMatch(string text, int index, int beglimit, int endlimit)
- {
- if (!RightToLeft)
- {
- if (index < beglimit || endlimit - index < Pattern.Length)
- return false;
- }
- else
- {
- if (index > endlimit || index - beglimit < Pattern.Length)
- return false;
-
- index -= Pattern.Length;
- }
-
- if (CaseInsensitive)
- {
- TextInfo textinfo = _culture.TextInfo;
-
- for (int i = 0; i < Pattern.Length; i++)
- {
- if (Pattern[i] != textinfo.ToLower(text[index + i]))
- {
- return false;
- }
- }
-
- return true;
- }
-
- return Pattern.AsSpan().SequenceEqual(text.AsSpan(index, Pattern.Length));
- }
-
- /// <summary>
- /// Scan uses the Boyer-Moore algorithm to find the first occurrence
- /// of the specified string within text, beginning at index, and
- /// constrained within beglimit and endlimit.
- ///
- /// The direction and case-sensitivity of the match is determined
- /// by the arguments to the RegexBoyerMoore constructor.
- /// </summary>
- public int Scan(string text, int index, int beglimit, int endlimit)
- {
- int defadv;
- int test;
- int startmatch;
- int endmatch;
- int bump;
-
- if (!RightToLeft)
- {
- defadv = Pattern.Length;
- startmatch = Pattern.Length - 1;
- endmatch = 0;
- test = index + defadv - 1;
- bump = 1;
- }
- else
- {
- defadv = -Pattern.Length;
- startmatch = 0;
- endmatch = -defadv - 1;
- test = index + defadv;
- bump = -1;
- }
-
- char chMatch = Pattern[startmatch];
- char chTest;
- int test2;
- int match;
- int advance;
- int[] unicodeLookup;
-
- while (true)
- {
- if (test >= endlimit || test < beglimit)
- return -1;
-
- chTest = text[test];
-
- if (CaseInsensitive)
- chTest = _culture.TextInfo.ToLower(chTest);
-
- if (chTest != chMatch)
- {
- if (chTest < 128)
- advance = NegativeASCII[chTest];
- else if (null != NegativeUnicode && (null != (unicodeLookup = NegativeUnicode[chTest >> 8])))
- advance = unicodeLookup[chTest & 0xFF];
- else
- advance = defadv;
-
- test += advance;
- }
- else
- { // if (chTest == chMatch)
- test2 = test;
- match = startmatch;
-
- while (true)
- {
- if (match == endmatch)
- return (RightToLeft ? test2 + 1 : test2);
-
- match -= bump;
- test2 -= bump;
-
- chTest = text[test2];
-
- if (CaseInsensitive)
- chTest = _culture.TextInfo.ToLower(chTest);
-
- if (chTest != Pattern[match])
- {
- advance = Positive[match];
- if ((chTest & 0xFF80) == 0)
- test2 = (match - startmatch) + NegativeASCII[chTest];
- else if (null != NegativeUnicode && (null != (unicodeLookup = NegativeUnicode[chTest >> 8])))
- test2 = (match - startmatch) + unicodeLookup[chTest & 0xFF];
- else
- {
- test += advance;
- break;
- }
-
- if (RightToLeft ? test2 < advance : test2 > advance)
- advance = test2;
-
- test += advance;
- break;
- }
- }
- }
- }
- }
-
-#if DEBUG
- /// <summary>Used when dumping for debugging.</summary>
- [ExcludeFromCodeCoverage]
- public override string ToString() => Dump(string.Empty);
-
- [ExcludeFromCodeCoverage]
- public string Dump(string indent)
- {
- var sb = new StringBuilder();
-
- sb.AppendLine($"{indent}BM Pattern: {Pattern}");
-
- sb.Append($"{indent}Positive: ");
- foreach (int i in Positive)
- {
- sb.Append($"{i} ");
- }
- sb.AppendLine();
-
- if (NegativeASCII != null)
- {
- sb.Append($"{indent}Negative table: ");
- for (int i = 0; i < NegativeASCII.Length; i++)
- {
- if (NegativeASCII[i] != Pattern.Length)
- {
- sb.Append($" {{{Regex.Escape(((char)i).ToString())} {NegativeASCII[i]}}}");
- }
- }
- }
- sb.AppendLine();
-
- return sb.ToString();
- }
-#endif
- }
-}
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.Globalization;
+using System.Runtime.CompilerServices;
using System.Threading;
namespace System.Text.RegularExpressions
internal const string NotECMADigitClass = "\x01\x02\x00" + ECMADigitSet;
internal const string AnyClass = "\x00\x01\x00\x00";
+ private const string EmptyClass = "\x00\x00\x00";
// UnicodeCategory is zero based, so we add one to each value and subtract it off later
private const int DefinedCategoriesCapacity = 38;
}
}
- /// <summary>Gets whether the specified string participates in case conversion.</summary>
- /// <remarks>The string participates in case conversion if any of its characters do.</remarks>
- public static bool ParticipatesInCaseConversion(string s)
+ /// <summary>Gets whether the specified span participates in case conversion.</summary>
+ /// <remarks>The span participates in case conversion if any of its characters do.</remarks>
+ public static bool ParticipatesInCaseConversion(ReadOnlySpan<char> s)
{
foreach (char c in s)
{
}
/// <summary>Gets whether we can iterate through the set list pairs in order to completely enumerate the set's contents.</summary>
+ /// <remarks>This may enumerate negated characters if the set is negated.</remarks>
private static bool CanEasilyEnumerateSetContents(string set) =>
set.Length > SetStartIndex &&
set[SetLengthIndex] > 0 &&
}
}
- public static bool CharInClass(char ch, string set, ref int[]? asciiResultCache)
+ /// <summary>Determines a character's membership in a character class (via the string representation of the class).</summary>
+ /// <param name="ch">The character.</param>
+ /// <param name="set">The string representation of the character class.</param>
+ /// <param name="asciiLazyCache">A lazily-populated cache for ASCII results stored in a 256-bit array.</param>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static bool CharInClass(char ch, string set, ref uint[]? asciiLazyCache)
{
- // The int[] contains 8 ints, or 256 bits. These are laid out as pairs, where the first bit ("known") in the pair
- // says whether the second bit ("value") in the pair has already been computed. Once a value is computed, it's never
+ // The uint[] contains 8 uints, or 256 bits. These are laid out as pairs, where the first bit ("known") in the pair
+ // says whether the second bit ("value") in the pair has already been computed. Once a value is computed, it's never
// changed, so since Int32s are written/read atomically, we can trust the value bit if we see that the known bit
// has been set. If the known bit hasn't been set, then we proceed to look it up, and then swap in the result.
const int CacheArrayLength = 8;
- Debug.Assert(asciiResultCache is null || asciiResultCache.Length == CacheArrayLength, "set lookup should be able to store two bits for each of the first 128 characters");
+ Debug.Assert(asciiLazyCache is null || asciiLazyCache.Length == CacheArrayLength, "set lookup should be able to store two bits for each of the first 128 characters");
- if (ch < 128)
+ // If the character is ASCII and the cache already holds an answer for it, use it.
+ if (asciiLazyCache is uint[] cache)
{
- // Lazily-initialize the cache for this set.
- if (asciiResultCache is null)
+ int index = ch >> 4;
+ if ((uint)index < (uint)cache.Length)
{
- Interlocked.CompareExchange(ref asciiResultCache, new int[CacheArrayLength], null);
+ Debug.Assert(ch < 128);
+ uint current = cache[index];
+ uint bit = 1u << ((ch & 0xF) << 1);
+ if ((current & bit) != 0)
+ {
+ return (current & (bit << 1)) != 0;
+ }
}
+ }
- // Determine which int in the lookup array contains the known and value bits for this character,
- // and compute their bit numbers.
- ref int slot = ref asciiResultCache[ch >> 4];
- int knownBit = 1 << ((ch & 0xF) << 1);
- int valueBit = knownBit << 1;
-
- // If the value for this bit has already been computed, use it.
- int current = slot;
- if ((current & knownBit) != 0)
- {
- return (current & valueBit) != 0;
- }
+ // For ASCII, lazily initialize. For non-ASCII, just compute the value.
+ return ch < 128 ?
+ InitializeValue(ch, set, ref asciiLazyCache) :
+ CharInClassRecursive(ch, set, 0);
+ static bool InitializeValue(char ch, string set, ref uint[]? asciiLazyCache)
+ {
// (After warm-up, we should find ourselves rarely getting here.)
+ Debug.Assert(ch < 128);
- // Otherwise, compute it normally.
+ // Compute the result and determine which bits to write back to the array and "or" the bits back in a thread-safe manner.
bool isInClass = CharInClass(ch, set);
-
- // Determine which bits to write back to the array and "or" the bits back in a thread-safe manner.
- int bitsToSet = knownBit;
+ uint bitsToSet = 1u << ((ch & 0xF) << 1);
if (isInClass)
{
- bitsToSet |= valueBit;
+ bitsToSet |= bitsToSet << 1;
}
+
+ uint[]? cache = asciiLazyCache ?? Interlocked.CompareExchange(ref asciiLazyCache, new uint[CacheArrayLength], null) ?? asciiLazyCache;
#if REGEXGENERATOR
- InterlockedExtensions.Or(ref slot, bitsToSet);
+ InterlockedExtensions.Or(ref cache[ch >> 4], bitsToSet);
#else
- Interlocked.Or(ref slot, bitsToSet);
+ Interlocked.Or(ref cache[ch >> 4], bitsToSet);
#endif
// Return the computed value.
return isInClass;
}
-
- // Non-ASCII. Fall back to computing the answer.
- return CharInClassRecursive(ch, set, 0);
}
+ /// <summary>
+ /// Determines a character's membership in a character class (via the string representation of the class).
+ /// </summary>
public static bool CharInClass(char ch, string set) =>
CharInClassRecursive(ch, set, 0);
return new RegexCharClass(IsNegated(charClass, start), ranges, categoriesBuilder, sub);
}
+ #region Perf workaround until https://github.com/dotnet/runtime/issues/61048 and https://github.com/dotnet/runtime/issues/59492 are addressed
+ // TODO: https://github.com/dotnet/runtime/issues/61048
+ // The below functionality needs to be removed/replaced/generalized. The goal is to avoid relying on
+ // ToLower and culture-based operation at match time, and instead be able to compute at construction
+ // time case folding equivalence classes that let us determine up-front the set of characters considered
+ // valid for a match. For now, we do this just for ASCII, and for anything else fall back to the
+ // pre-existing mechanism whereby a culture is used at construction time to ToLower and then one is
+ // used at match time to ToLower. We also skip 'i' and 'I', as the casing of those varies across culture
+ // whereas every other ASCII value's casing is stable across culture. We could hardcode the values for
+ // when an invariant vs tr/az culture vs any other culture is used, and we likely will, but for now doing
+ // so would be a breaking change, as in doing so we'd be relying only on the culture present at the time
+ // of construction rather than the one at the time of match. That will be resolved with
+ // https://github.com/dotnet/runtime/issues/59492.
+
+ /// <summary>Creates a set string for a single character, optionally factoring in case-insensitivity.</summary>
+ /// <param name="c">The character for which to create the set.</param>
+ /// <param name="caseInsensitive">null if case-sensitive; non-null if case-insensitive, in which case it's the culture to use.</param>
+ /// <param name="resultIsCaseInsensitive">false if the caller should strip out RegexOptions.IgnoreCase because it's now fully represented by the set; otherwise, true.</param>
+ /// <returns>The created set string.</returns>
+ public static string OneToStringClass(char c, CultureInfo? caseInsensitive, out bool resultIsCaseInsensitive)
+ {
+ var vsb = new ValueStringBuilder(stackalloc char[4]); // scratch buffer for the set's member characters; the paths below append at most three
+
+ if (caseInsensitive is null) // case-sensitive: the set is just the character itself
+ {
+ resultIsCaseInsensitive = false;
+ vsb.Append(c);
+ }
+ else if (c < 128 && (c | 0x20) != 'i') // ASCII other than 'i'/'I' (OR-ing in 0x20 maps 'I' onto 'i')
+ {
+ resultIsCaseInsensitive = false;
+ switch (c)
+ {
+ // These are the same in all cultures. As with the rest of this support, we can generalize this
+ // once we fix the aforementioned casing issues, e.g. by lazily populating an interning cache
+ // rather than hardcoding the strings for these values, once almost all values will be the same
+ // regardless of culture.
+ case 'A': case 'a': return "\0\x0004\0ABab";
+ case 'B': case 'b': return "\0\x0004\0BCbc";
+ case 'C': case 'c': return "\0\x0004\0CDcd";
+ case 'D': case 'd': return "\0\x0004\0DEde";
+ case 'E': case 'e': return "\0\x0004\0EFef";
+ case 'F': case 'f': return "\0\x0004\0FGfg";
+ case 'G': case 'g': return "\0\x0004\0GHgh";
+ case 'H': case 'h': return "\0\x0004\0HIhi";
+ // 'I'/'i' are deliberately absent: the condition above never lets them reach this switch
+ case 'J': case 'j': return "\0\x0004\0JKjk";
+ case 'K': case 'k': return "\0\x0006\0KLkl\u212A\u212B";
+ case 'L': case 'l': return "\0\x0004\0LMlm";
+ case 'M': case 'm': return "\0\x0004\0MNmn";
+ case 'N': case 'n': return "\0\x0004\0NOno";
+ case 'O': case 'o': return "\0\x0004\0OPop";
+ case 'P': case 'p': return "\0\x0004\0PQpq";
+ case 'Q': case 'q': return "\0\x0004\0QRqr";
+ case 'R': case 'r': return "\0\x0004\0RSrs";
+ case 'S': case 's': return "\0\x0004\0STst";
+ case 'T': case 't': return "\0\x0004\0TUtu";
+ case 'U': case 'u': return "\0\x0004\0UVuv";
+ case 'V': case 'v': return "\0\x0004\0VWvw";
+ case 'W': case 'w': return "\0\x0004\0WXwx";
+ case 'X': case 'x': return "\0\x0004\0XYxy";
+ case 'Y': case 'y': return "\0\x0004\0YZyz";
+ case 'Z': case 'z': return "\0\x0004\0Z[z{";
+
+ // All the ASCII characters that don't participate in case conversion (!ParticipatesInCaseConversion)
+ case '\u0000': return "\0\u0002\0\u0000\u0001";
+ case '\u0001': return "\0\u0002\0\u0001\u0002";
+ case '\u0002': return "\0\u0002\0\u0002\u0003";
+ case '\u0003': return "\0\u0002\0\u0003\u0004";
+ case '\u0004': return "\0\u0002\0\u0004\u0005";
+ case '\u0005': return "\0\u0002\0\u0005\u0006";
+ case '\u0006': return "\0\u0002\0\u0006\u0007";
+ case '\u0007': return "\0\u0002\0\u0007\u0008";
+ case '\u0008': return "\0\u0002\0\u0008\u0009";
+ case '\u0009': return "\0\u0002\0\u0009\u000A";
+ case '\u000A': return "\0\u0002\0\u000A\u000B";
+ case '\u000B': return "\0\u0002\0\u000B\u000C";
+ case '\u000C': return "\0\u0002\0\u000C\u000D";
+ case '\u000D': return "\0\u0002\0\u000D\u000E";
+ case '\u000E': return "\0\u0002\0\u000E\u000F";
+ case '\u000F': return "\0\u0002\0\u000F\u0010";
+ case '\u0010': return "\0\u0002\0\u0010\u0011";
+ case '\u0011': return "\0\u0002\0\u0011\u0012";
+ case '\u0012': return "\0\u0002\0\u0012\u0013";
+ case '\u0013': return "\0\u0002\0\u0013\u0014";
+ case '\u0014': return "\0\u0002\0\u0014\u0015";
+ case '\u0015': return "\0\u0002\0\u0015\u0016";
+ case '\u0016': return "\0\u0002\0\u0016\u0017";
+ case '\u0017': return "\0\u0002\0\u0017\u0018";
+ case '\u0018': return "\0\u0002\0\u0018\u0019";
+ case '\u0019': return "\0\u0002\0\u0019\u001A";
+ case '\u001A': return "\0\u0002\0\u001A\u001B";
+ case '\u001B': return "\0\u0002\0\u001B\u001C";
+ case '\u001C': return "\0\u0002\0\u001C\u001D";
+ case '\u001D': return "\0\u0002\0\u001D\u001E";
+ case '\u001E': return "\0\u0002\0\u001E\u001F";
+ case '\u001F': return "\0\u0002\0\u001F\u0020";
+ case '\u0020': return "\0\u0002\0\u0020\u0021";
+ case '\u0021': return "\0\u0002\0\u0021\u0022";
+ case '\u0022': return "\0\u0002\0\u0022\u0023";
+ case '\u0023': return "\0\u0002\0\u0023\u0024";
+ case '\u0025': return "\0\u0002\0\u0025\u0026";
+ case '\u0026': return "\0\u0002\0\u0026\u0027";
+ case '\u0027': return "\0\u0002\0\u0027\u0028";
+ case '\u0028': return "\0\u0002\0\u0028\u0029";
+ case '\u0029': return "\0\u0002\0\u0029\u002A";
+ case '\u002A': return "\0\u0002\0\u002A\u002B";
+ case '\u002C': return "\0\u0002\0\u002C\u002D";
+ case '\u002D': return "\0\u0002\0\u002D\u002E";
+ case '\u002E': return "\0\u0002\0\u002E\u002F";
+ case '\u002F': return "\0\u0002\0\u002F\u0030";
+ case '\u0030': return "\0\u0002\0\u0030\u0031";
+ case '\u0031': return "\0\u0002\0\u0031\u0032";
+ case '\u0032': return "\0\u0002\0\u0032\u0033";
+ case '\u0033': return "\0\u0002\0\u0033\u0034";
+ case '\u0034': return "\0\u0002\0\u0034\u0035";
+ case '\u0035': return "\0\u0002\0\u0035\u0036";
+ case '\u0036': return "\0\u0002\0\u0036\u0037";
+ case '\u0037': return "\0\u0002\0\u0037\u0038";
+ case '\u0038': return "\0\u0002\0\u0038\u0039";
+ case '\u0039': return "\0\u0002\0\u0039\u003A";
+ case '\u003A': return "\0\u0002\0\u003A\u003B";
+ case '\u003B': return "\0\u0002\0\u003B\u003C";
+ case '\u003F': return "\0\u0002\0\u003F\u0040";
+ case '\u0040': return "\0\u0002\0\u0040\u0041";
+ case '\u005B': return "\0\u0002\0\u005B\u005C";
+ case '\u005C': return "\0\u0002\0\u005C\u005D";
+ case '\u005D': return "\0\u0002\0\u005D\u005E";
+ case '\u005F': return "\0\u0002\0\u005F\u0060";
+ case '\u007B': return "\0\u0002\0\u007B\u007C";
+ case '\u007D': return "\0\u0002\0\u007D\u007E";
+ case '\u007F': return "\0\u0002\0\u007F\u0080";
+ }
+ AddAsciiCharIgnoreCaseEquivalence(c, ref vsb, caseInsensitive); // any ASCII character without a hardcoded string above
+ }
+ else if (!ParticipatesInCaseConversion(c)) // the set is just the character itself, and IgnoreCase can be dropped
+ {
+ resultIsCaseInsensitive = false;
+ vsb.Append(c);
+ }
+ else
+ {
+ resultIsCaseInsensitive = true; // the caller must keep IgnoreCase: the set holds only the lowercased character
+ vsb.Append(char.ToLower(c, caseInsensitive));
+ }
+
+ string result = CharsToStringClass(vsb.AsSpan());
+ vsb.Dispose();
+ return result;
+ }
+
+ private static unsafe string CharsToStringClass(ReadOnlySpan<char> chars) // builds a set string with one single-character range per element of 'chars'
+ {
+#if DEBUG
+ // Make sure they're all sorted with no duplicates (i.e. strictly increasing)
+ for (int index = 0; index < chars.Length - 1; index++)
+ {
+ Debug.Assert(chars[index] < chars[index + 1]);
+ }
+#endif
+
+ // If there aren't any chars, just return an empty class.
+ if (chars.Length == 0)
+ {
+ return EmptyClass;
+ }
+
+ // Count how many characters there actually are. All but the very last possible
+ // char value will have two characters, one for the inclusive beginning of range
+ // and one for the exclusive end of range.
+ int count = chars.Length * 2;
+ if (chars[chars.Length - 1] == LastChar)
+ {
+ count--;
+ }
+
+ // Get the pointer/length of the span to be able to pass it into string.Create.
+ fixed (char* charsPtr = chars)
+ {
+#if REGEXGENERATOR
+ return StringExtensions.Create(
+#else
+ return string.Create(
+#endif
+ SetStartIndex + count, ((IntPtr)charsPtr, chars.Length), static (span, state) =>
+ {
+ // Reconstruct the span now that we're inside of the lambda.
+ ReadOnlySpan<char> chars = new ReadOnlySpan<char>((char*)state.Item1, state.Length); // 'Length' is the tuple element name inferred from 'chars.Length'
+
+ // Fill in the set string
+ span[FlagsIndex] = (char)0;
+ span[CategoryLengthIndex] = (char)0;
+ span[SetLengthIndex] = (char)(span.Length - SetStartIndex);
+ int i = SetStartIndex;
+ foreach (char c in chars)
+ {
+ span[i++] = c; // inclusive beginning of the range
+ if (c != LastChar) // the last possible char gets no end marker (matches the count adjustment above)
+ {
+ span[i++] = (char)(c + 1); // exclusive end of the range
+ }
+ }
+ Debug.Assert(i == span.Length);
+ });
+ }
+ }
+
+ /// <summary>Tries to create from a RegexOptions.IgnoreCase set string a new set string that can be used without RegexOptions.IgnoreCase.</summary>
+ /// <param name="set">The original set string from a RegexOptions.IgnoreCase node.</param>
+ /// <param name="culture">The culture in use.</param>
+ /// <returns>A new set string if one could be created; otherwise, null.</returns>
+ public static string? MakeCaseSensitiveIfPossible(string set, CultureInfo culture)
+ {
+ if (IsNegated(set)) // negated sets aren't supported by the enumeration approach below
+ {
+ return null;
+ }
+
+ // We'll eventually need a more robust way to do this for any set. For now, we iterate through each character
+ // in the set, and to avoid spending lots of time doing so, we limit the number of characters. This approach also
+ // limits the structure of the sets allowed, e.g. they can't be negated, can't use subtraction, etc.
+ Span<char> setChars = stackalloc char[64]; // arbitrary limit chosen to include common groupings like all ASCII letters and digits
+
+ // Try to get the set's characters.
+ int setCharsCount = GetSetChars(set, setChars);
+ if (setCharsCount == 0) // presumably GetSetChars couldn't enumerate the set into the buffer - confirm against GetSetChars
+ {
+ return null;
+ }
+
+ // Enumerate all the characters and add all characters that form their case folding equivalence class.
+ var rcc = new RegexCharClass();
+ var vsb = new ValueStringBuilder(stackalloc char[4]);
+ foreach (char c in setChars.Slice(0, setCharsCount))
+ {
+ if (c >= 128 || c == 'i' || c == 'I') // give up: AddAsciiCharIgnoreCaseEquivalence asserts ASCII and excludes 'i'/'I'
+ {
+ return null;
+ }
+
+ vsb.Length = 0; // reuse the same builder for each character
+ AddAsciiCharIgnoreCaseEquivalence(c, ref vsb, culture);
+ foreach (char v in vsb.AsSpan())
+ {
+ rcc.AddChar(v);
+ }
+ }
+
+ // Return the constructed class.
+ return rcc.ToStringClass();
+ }
+
+ private static void AddAsciiCharIgnoreCaseEquivalence(char c, ref ValueStringBuilder vsb, CultureInfo culture) // appends c's upper/lower forms under 'culture' to vsb, smaller value first
+ {
+ Debug.Assert(c < 128, $"Expected ASCII, got {(int)c}");
+ Debug.Assert(c != 'i' && c != 'I', "'i' currently doesn't work correctly in all cultures");
+
+ char upper = char.ToUpper(c, culture);
+ char lower = char.ToLower(c, culture);
+
+ if (upper < lower)
+ {
+ vsb.Append(upper);
+ }
+ vsb.Append(lower); // when upper == lower, this is the only one of the pair appended
+ if (upper > lower)
+ {
+ vsb.Append(upper);
+ }
+
+ if (c == 'k' || c == 'K')
+ {
+ vsb.Append((char)0x212A); // kelvin sign
+ }
+ }
+ #endregion
+
/// <summary>
/// Constructs the string representation of the class.
/// </summary>
using System.Collections;
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
+using System.Globalization;
namespace System.Text.RegularExpressions
{
public readonly RegexTree Tree; // the optimized parse tree
public readonly int[] Codes; // the code
public readonly string[] Strings; // the string/set table
- public readonly int[]?[] StringsAsciiLookup; // the ASCII lookup table optimization for the sets in Strings
+ public readonly uint[]?[] StringsAsciiLookup; // the ASCII lookup table optimization for the sets in Strings
public readonly int TrackCount; // how many instructions use backtracking
public readonly Hashtable? Caps; // mapping of user group numbers -> impl group slots
public readonly int CapSize; // number of impl group slots
- public readonly (string CharClass, bool CaseInsensitive)[]? LeadingCharClasses; // the set of candidate first characters, if available. Each entry corresponds to the next char in the input.
- public int[]? LeadingCharClassAsciiLookup; // the ASCII lookup table optimization for LeadingCharClasses[0], if it exists; only used by the interpreter
- public readonly RegexBoyerMoore? BoyerMoorePrefix; // the fixed prefix string as a Boyer-Moore machine, if available
- public readonly int LeadingAnchor; // the leading anchor, if one exists (RegexPrefixAnalyzer.Bol, etc)
public readonly bool RightToLeft; // true if right to left
+ public readonly RegexFindOptimizations FindOptimizations;
- public RegexCode(RegexTree tree, int[] codes, string[] strings, int trackcount,
- Hashtable? caps, int capsize,
- RegexBoyerMoore? boyerMoorePrefix,
- (string CharClass, bool CaseInsensitive)[]? leadingCharClasses,
- int leadingAnchor, bool rightToLeft)
+ public RegexCode(RegexTree tree, CultureInfo culture, int[] codes, string[] strings, int trackcount,
+ Hashtable? caps, int capsize)
{
- Debug.Assert(boyerMoorePrefix is null || leadingCharClasses is null);
-
Tree = tree;
Codes = codes;
Strings = strings;
- StringsAsciiLookup = new int[strings.Length][];
+ StringsAsciiLookup = new uint[strings.Length][];
TrackCount = trackcount;
Caps = caps;
CapSize = capsize;
- BoyerMoorePrefix = boyerMoorePrefix;
- LeadingCharClasses = leadingCharClasses;
- LeadingAnchor = leadingAnchor;
- RightToLeft = rightToLeft;
+ RightToLeft = (tree.Options & RegexOptions.RightToLeft) != 0;
+ FindOptimizations = new RegexFindOptimizations(tree, culture);
}
public static bool OpcodeBacktracks(int Op)
var sb = new StringBuilder();
sb.AppendLine($"Direction: {(RightToLeft ? "right-to-left" : "left-to-right")}");
- sb.AppendLine($"Anchor: {RegexPrefixAnalyzer.AnchorDescription(LeadingAnchor)}");
+ sb.AppendLine($"Anchor: {RegexPrefixAnalyzer.AnchorDescription(FindOptimizations.LeadingAnchor)}");
sb.AppendLine();
-
- if (BoyerMoorePrefix != null)
- {
- sb.AppendLine("Boyer-Moore:");
- sb.AppendLine(BoyerMoorePrefix.Dump(" "));
- sb.AppendLine();
- }
-
- if (LeadingCharClasses != null)
- {
- sb.AppendLine("First Chars:");
- for (int i = 0; i < LeadingCharClasses.Length; i++)
- {
- sb.AppendLine($"{i}: {RegexCharClass.SetDescription(LeadingCharClasses[i].CharClass)}");
- }
- sb.AppendLine();
- }
-
for (int i = 0; i < Codes.Length; i += OpcodeSize(Codes[i]))
{
sb.AppendLine(OpcodeDescription(i));
private static readonly MethodInfo s_spanIndexOfSpan = typeof(MemoryExtensions).GetMethod("IndexOf", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char));
private static readonly MethodInfo s_spanIndexOfAnyCharChar = typeof(MemoryExtensions).GetMethod("IndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char));
private static readonly MethodInfo s_spanIndexOfAnyCharCharChar = typeof(MemoryExtensions).GetMethod("IndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char));
+ private static readonly MethodInfo s_spanIndexOfAnySpan = typeof(MemoryExtensions).GetMethod("IndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char));
+ private static readonly MethodInfo s_spanLastIndexOfChar = typeof(MemoryExtensions).GetMethod("LastIndexOf", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char));
+ private static readonly MethodInfo s_spanLastIndexOfSpan = typeof(MemoryExtensions).GetMethod("LastIndexOf", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char));
private static readonly MethodInfo s_spanSliceIntMethod = typeof(ReadOnlySpan<char>).GetMethod("Slice", new Type[] { typeof(int) })!;
private static readonly MethodInfo s_spanSliceIntIntMethod = typeof(ReadOnlySpan<char>).GetMethod("Slice", new Type[] { typeof(int), typeof(int) })!;
private static readonly MethodInfo s_spanStartsWith = typeof(MemoryExtensions).GetMethod("StartsWith", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char));
protected RegexCode? _code; // the RegexCode object
protected int[]? _codes; // the RegexCodes being translated
protected string[]? _strings; // the stringtable associated with the RegexCodes
- protected (string CharClass, bool CaseInsensitive)[]? _leadingCharClasses; // the possible first chars computed by RegexPrefixAnalyzer
- protected RegexBoyerMoore? _boyerMoorePrefix; // a prefix as a boyer-moore machine
- protected int _leadingAnchor; // the set of anchors
protected bool _hasTimeout; // whether the regex has a non-infinite timeout
private Label[]? _labels; // a label for every operation in _codes
}
_runtextLocal = DeclareString();
_textInfoLocal = null;
- if (!_options.HasFlag(RegexOptions.CultureInvariant))
+ if ((_options & RegexOptions.CultureInvariant) == 0)
{
- bool needsCulture = _options.HasFlag(RegexOptions.IgnoreCase) || _boyerMoorePrefix?.CaseInsensitive == true;
- if (!needsCulture && _leadingCharClasses != null)
+ bool needsCulture = _code.FindOptimizations.FindMode switch
{
- for (int i = 0; i < _leadingCharClasses.Length; i++)
- {
- if (_leadingCharClasses[i].CaseInsensitive)
- {
- needsCulture = true;
- break;
- }
- }
- }
+ FindNextStartingPositionMode.FixedLiteral_LeftToRight_CaseInsensitive or
+ FindNextStartingPositionMode.LeadingLiteral_RightToLeft_CaseInsensitive or
+ FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive or
+ FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive or
+ FindNextStartingPositionMode.LeadingSet_RightToLeft_CaseInsensitive => true,
+
+ _ when _code.FindOptimizations.FixedDistanceSets is List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)> sets => sets.Exists(set => set.CaseInsensitive),
+
+ _ => false,
+ };
if (needsCulture)
{
Ret();
MarkLabel(finishedLengthCheck);
- GenerateAnchorChecks();
-
- if (_boyerMoorePrefix is RegexBoyerMoore { NegativeUnicode: null } rbm)
- {
- if (rbm.PatternSupportsIndexOf)
- {
- GenerateIndexOf(rbm.Pattern);
- }
- else
- {
- GenerateBoyerMoore(rbm);
- }
- }
- else if (_leadingCharClasses is not null)
+ // Emit any anchors.
+ if (GenerateAnchors())
{
- if (_code.RightToLeft)
- {
- GenerateLeadingCharacter_RightToLeft();
- }
- else
- {
- GenerateLeadingCharacter_LeftToRight();
- }
+ return;
}
- else
+
+ // Either anchors weren't specified, or they don't completely root all matches to a specific location.
+
+ switch (_code.FindOptimizations.FindMode)
{
- // return true;
- Ldc(1);
- Ret();
+ case FindNextStartingPositionMode.LeadingPrefix_LeftToRight_CaseSensitive:
+ Debug.Assert(!string.IsNullOrEmpty(_code.FindOptimizations.LeadingCaseSensitivePrefix));
+ GenerateIndexOf_LeftToRight(_code.FindOptimizations.LeadingCaseSensitivePrefix);
+ break;
+
+ case FindNextStartingPositionMode.LeadingPrefix_RightToLeft_CaseSensitive:
+ Debug.Assert(!string.IsNullOrEmpty(_code.FindOptimizations.LeadingCaseSensitivePrefix));
+ GenerateIndexOf_RightToLeft(_code.FindOptimizations.LeadingCaseSensitivePrefix);
+ break;
+
+ case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseSensitive:
+ case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive:
+ case FindNextStartingPositionMode.FixedSets_LeftToRight_CaseSensitive:
+ case FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive:
+ Debug.Assert(_code.FindOptimizations.FixedDistanceSets is { Count: > 0 });
+ GenerateFixedSet_LeftToRight();
+ break;
+
+ case FindNextStartingPositionMode.LeadingSet_RightToLeft_CaseSensitive:
+ case FindNextStartingPositionMode.LeadingSet_RightToLeft_CaseInsensitive:
+ Debug.Assert(_code.FindOptimizations.FixedDistanceSets is { Count: > 0 });
+ GenerateFixedSet_RightToLeft();
+ break;
+
+ default:
+ Debug.Fail($"Unexpected mode: {_code.FindOptimizations.FindMode}");
+ goto case FindNextStartingPositionMode.NoSearch;
+
+ case FindNextStartingPositionMode.NoSearch:
+ // return true;
+ Ldc(1);
+ Ret();
+ break;
}
- void GenerateAnchorChecks()
+ // Emits any anchors. Returns true if the anchor roots any match to a specific location and thus no further
+ // searching is required; otherwise, false.
+ bool GenerateAnchors()
{
// Generate anchor checks.
- if ((_leadingAnchor & (RegexPrefixAnalyzer.Beginning | RegexPrefixAnalyzer.Start | RegexPrefixAnalyzer.EndZ | RegexPrefixAnalyzer.End | RegexPrefixAnalyzer.Bol)) != 0)
+ if ((_code.FindOptimizations.LeadingAnchor & (RegexPrefixAnalyzer.Beginning | RegexPrefixAnalyzer.Start | RegexPrefixAnalyzer.EndZ | RegexPrefixAnalyzer.End | RegexPrefixAnalyzer.Bol)) != 0)
{
- switch (_leadingAnchor)
+ switch (_code.FindOptimizations.LeadingAnchor)
{
case RegexPrefixAnalyzer.Beginning:
{
}
Ldc(1);
Ret();
- return;
+ return true;
case RegexPrefixAnalyzer.Start:
{
}
Ldc(1);
Ret();
- return;
+ return true;
case RegexPrefixAnalyzer.EndZ:
{
}
Ldc(1);
Ret();
- return;
+ return true;
- case RegexPrefixAnalyzer.End when minRequiredLength == 0: // if it's > 0, we already output a more stringent check
+ case RegexPrefixAnalyzer.End:
{
Label l1 = DefineLabel();
Ldloc(_runtextposLocal);
}
Ldc(1);
Ret();
- return;
+ return true;
- case RegexPrefixAnalyzer.Bol when !_code.RightToLeft: // don't bother optimizing for the niche case of RegexOptions.RightToLeft | RegexOptions.Multiline
+ case RegexPrefixAnalyzer.Bol:
{
// Optimize the handling of a Beginning-Of-Line (BOL) anchor. BOL is special, in that unlike
// other anchors like Beginning, there are potentially multiple places a BOL can match. So unlike
// the other anchors, which all skip all subsequent processing if found, with BOL we just use it
- // to boost our position to the next line, and then continue normally with any Boyer-Moore or
- // leading char class searches.
+ // to boost our position to the next line, and then continue normally with any prefix or char class searches.
+ Debug.Assert(!_code.RightToLeft, "RightToLeft isn't implemented and should have been filtered out previously");
Label atBeginningOfLine = DefineLabel();
// if (runtextpos > runtextbeg...
break;
}
}
- }
-
- void GenerateBoyerMoore(RegexBoyerMoore rbm)
- {
- LocalBuilder limitLocal;
- int beforefirst;
- int last;
- if (!_code.RightToLeft)
- {
- limitLocal = _runtextendLocal;
- beforefirst = -1;
- last = rbm.Pattern.Length - 1;
- }
- else
- {
- limitLocal = _runtextbegLocal!;
- beforefirst = rbm.Pattern.Length;
- last = 0;
- }
-
- int chLast = rbm.Pattern[last];
- // string runtext = this.runtext;
- Mvfldloc(s_runtextField, _runtextLocal);
-
- // runtextpos += pattern.Length - 1; // advance to match last character
- Ldloc(_runtextposLocal);
- if (!_code.RightToLeft)
- {
- Ldc(rbm.Pattern.Length - 1);
- Add();
- }
- else
- {
- Ldc(rbm.Pattern.Length);
- Sub();
- }
- Stloc(_runtextposLocal);
-
- Label lStart = DefineLabel();
- Br(lStart);
-
- // DefaultAdvance:
- // offset = pattern.Length;
- Label lDefaultAdvance = DefineLabel();
- MarkLabel(lDefaultAdvance);
- Ldc(_code.RightToLeft ? -rbm.Pattern.Length : rbm.Pattern.Length);
-
- // Advance:
- // runtextpos += offset;
- Label lAdvance = DefineLabel();
- MarkLabel(lAdvance);
- Ldloc(_runtextposLocal);
- Add();
- Stloc(_runtextposLocal);
-
- // Start:
- // if (runtextpos >= runtextend) goto returnFalse;
- MarkLabel(lStart);
- Ldloc(_runtextposLocal);
- Ldloc(limitLocal);
- if (!_code.RightToLeft)
- {
- BgeFar(returnFalse);
- }
- else
- {
- BltFar(returnFalse);
- }
-
- // ch = runtext[runtextpos];
- Rightchar();
- if (rbm.CaseInsensitive)
- {
- CallToLower();
- }
-
- Label lPartialMatch = DefineLabel();
- using (RentedLocalBuilder chLocal = RentInt32Local())
- {
- Stloc(chLocal);
- Ldloc(chLocal);
- Ldc(chLast);
-
- // if (ch == lastChar) goto partialMatch;
- BeqFar(lPartialMatch);
-
- // ch -= lowAscii;
- // if (ch > (highAscii - lowAscii)) goto defaultAdvance;
- Ldloc(chLocal);
- Ldc(rbm.LowASCII);
- Sub();
- Stloc(chLocal);
- Ldloc(chLocal);
- Ldc(rbm.HighASCII - rbm.LowASCII);
- BgtUn(lDefaultAdvance);
-
- // int offset = "lookupstring"[num];
- // goto advance;
- int negativeRange = rbm.HighASCII - rbm.LowASCII + 1;
- if (negativeRange > 1)
- {
- // Create a string to store the lookup table we use to find the offset.
- Debug.Assert(rbm.Pattern.Length <= char.MaxValue, "RegexBoyerMoore should have limited the size allowed.");
- string negativeLookup = string.Create(negativeRange, (rbm, beforefirst), static (span, state) =>
- {
- // Store the offsets into the string. RightToLeft has negative offsets, so to support it with chars (unsigned), we negate
- // the values to be stored in the string, and then at run time after looking up the offset in the string, negate it again.
- for (int i = 0; i < span.Length; i++)
- {
- int offset = state.rbm.NegativeASCII[i + state.rbm.LowASCII];
- if (offset == state.beforefirst)
- {
- offset = state.rbm.Pattern.Length;
- }
- else if (state.rbm.RightToLeft)
- {
- offset = -offset;
- }
- Debug.Assert(offset >= 0 && offset <= char.MaxValue);
- span[i] = (char)offset;
- }
- });
-
- // offset = lookupString[ch];
- // goto Advance;
- Ldstr(negativeLookup);
- Ldloc(chLocal);
- Call(s_stringGetCharsMethod);
- if (_code.RightToLeft)
- {
- Neg();
- }
- }
- else
- {
- // offset = value;
- Debug.Assert(negativeRange == 1);
- int offset = rbm.NegativeASCII[rbm.LowASCII];
- if (offset == beforefirst)
- {
- offset = _code.RightToLeft ? -rbm.Pattern.Length : rbm.Pattern.Length;
- }
- Ldc(offset);
- }
- BrFar(lAdvance);
- }
-
- // Emit a check for each character from the next to last down to the first.
- MarkLabel(lPartialMatch);
- Ldloc(_runtextposLocal);
- using (RentedLocalBuilder testLocal = RentInt32Local())
- {
- Stloc(testLocal);
-
- int prevLabelOffset = int.MaxValue;
- Label prevLabel = default;
- for (int i = rbm.Pattern.Length - 2; i >= 0; i--)
- {
- int charindex = _code.RightToLeft ? rbm.Pattern.Length - 1 - i : i;
-
- // if (runtext[--test] == pattern[index]) goto lNext;
- Ldloc(_runtextLocal);
- Ldloc(testLocal);
- Ldc(1);
- Sub(_code.RightToLeft);
- Stloc(testLocal);
- Ldloc(testLocal);
- Call(s_stringGetCharsMethod);
- if (rbm.CaseInsensitive && RegexCharClass.ParticipatesInCaseConversion(rbm.Pattern[charindex]))
- {
- CallToLower();
- }
- Ldc(rbm.Pattern[charindex]);
-
- if (prevLabelOffset == rbm.Positive[charindex])
- {
- BneFar(prevLabel);
- }
- else
- {
- Label lNext = DefineLabel();
- Beq(lNext);
-
- // offset = positive[ch];
- // goto advance;
- prevLabel = DefineLabel();
- prevLabelOffset = rbm.Positive[charindex];
- MarkLabel(prevLabel);
- Ldc(prevLabelOffset);
- BrFar(lAdvance);
-
- MarkLabel(lNext);
- }
- }
-
- // this.runtextpos = test;
- // return true;
- Ldthis();
- Ldloc(testLocal);
- if (_code.RightToLeft)
- {
- Ldc(1);
- Add();
- }
- Stfld(s_runtextposField);
- Ldc(1);
- Ret();
- }
+ return false;
}
- void GenerateIndexOf(string prefix)
+ void GenerateIndexOf_LeftToRight(string prefix)
{
using RentedLocalBuilder i = RentInt32Local();
Call(s_spanIndexOfSpan);
Stloc(i);
- // if (i < 0)
- // {
- // base.runtextpos = runtextend;
- // return false;
- // }
+ // if (i < 0) goto ReturnFalse;
Ldloc(i);
Ldc(0);
BltFar(returnFalse);
Ret();
}
- void GenerateLeadingCharacter_RightToLeft()
+ void GenerateIndexOf_RightToLeft(string prefix)
{
- Debug.Assert(_leadingCharClasses.Length == 1, "Only the FirstChars and not MultiFirstChars computation is supported for RightToLeft");
-
- using RentedLocalBuilder cLocal = RentInt32Local();
-
- Label l1 = DefineLabel();
- Label l2 = DefineLabel();
- Label l3 = DefineLabel();
- Label l4 = DefineLabel();
- Label l5 = DefineLabel();
-
- Mvfldloc(s_runtextField, _runtextLocal);
+ using RentedLocalBuilder i = RentInt32Local();
+ // int i = runtext.AsSpan(runtextbeg, runtextpos - runtextbeg).LastIndexOf(prefix);
+ Ldthis();
+ Ldfld(s_runtextField);
+ Ldloc(_runtextbegLocal!);
Ldloc(_runtextposLocal);
Ldloc(_runtextbegLocal!);
Sub();
- Stloc(cLocal);
+ Call(s_stringAsSpanIntIntMethod);
+ Ldstr(prefix);
+ Call(s_stringAsSpanMethod);
+ Call(s_spanLastIndexOfSpan);
+ Stloc(i);
- if (minRequiredLength == 0) // if minRequiredLength > 0, we already output a more stringent check
- {
- Ldloc(cLocal);
- Ldc(0);
- BleFar(l4);
- }
+ // if (i < 0) goto ReturnFalse;
+ Ldloc(i);
+ Ldc(0);
+ BltFar(returnFalse);
- MarkLabel(l1);
- Ldloc(cLocal);
+ // base.runtextpos = runtextbeg + i + LeadingCaseSensitivePrefix.Length;
+ // return true;
+ Ldthis();
+ Ldloc(_runtextbegLocal!);
+ Ldloc(i);
+ Add();
+ Ldc(prefix.Length);
+ Add();
+ Stfld(s_runtextposField);
Ldc(1);
- Sub();
- Stloc(cLocal);
+ Ret();
+ }
- Leftcharnext();
+ void GenerateFixedSet_RightToLeft()
+ {
+ (char[]? Chars, string Set, int Distance, bool CaseInsensitive) set = _code.FindOptimizations.FixedDistanceSets![0];
+ Debug.Assert(set.Distance == 0);
- if (!RegexCharClass.IsSingleton(_leadingCharClasses[0].CharClass))
- {
- EmitMatchCharacterClass(_leadingCharClasses[0].CharClass, _leadingCharClasses[0].CaseInsensitive);
- Brtrue(l2);
- }
- else
+ using RentedLocalBuilder i = RentInt32Local();
+
+ if (set.Chars is { Length: 1 } && !set.CaseInsensitive)
{
- Ldc(RegexCharClass.SingletonChar(_leadingCharClasses[0].CharClass));
- Beq(l2);
- }
+ // int i = runtext.AsSpan(runtextbeg, runtextpos - runtextbeg).LastIndexOf(set.Chars[0]);
+ Ldthis();
+ Ldfld(s_runtextField);
+ Ldloc(_runtextbegLocal!);
+ Ldloc(_runtextposLocal);
+ Ldloc(_runtextbegLocal!);
+ Sub();
+ Call(s_stringAsSpanIntIntMethod);
+ Ldc(set.Chars[0]);
+ Call(s_spanLastIndexOfChar);
+ Stloc(i);
- MarkLabel(l5);
+ // if (i < 0) goto ReturnFalse;
+ Ldloc(i);
+ Ldc(0);
+ BltFar(returnFalse);
- Ldloc(cLocal);
- Ldc(0);
- if (!RegexCharClass.IsSingleton(_leadingCharClasses[0].CharClass))
- {
- BgtFar(l1);
+ // base.runtextpos = runtextbeg + i + 1;
+ // return true;
+ Ldthis();
+ Ldloc(_runtextbegLocal!);
+ Ldloc(i);
+ Add();
+ Ldc(1);
+ Add();
+ Stfld(s_runtextposField);
+ Ldc(1);
+ Ret();
}
else
{
- Bgt(l1);
- }
+ Label condition = DefineLabel();
+ Label increment = DefineLabel();
+ Label body = DefineLabel();
- Ldc(0);
- Br(l3);
+ Mvfldloc(s_runtextField, _runtextLocal);
- MarkLabel(l2);
-
- Ldloc(_runtextposLocal);
- Ldc(1);
- Sub(_code.RightToLeft);
- Stloc(_runtextposLocal);
- Ldc(1);
+ // for (int i = runtextpos - 1; ...
+ Ldloc(_runtextposLocal);
+ Ldc(1);
+ Sub();
+ Stloc(i);
+ BrFar(condition);
+
+ // if (MatchCharClass(runtext[i], set))
+ MarkLabel(body);
+ Ldloc(_runtextLocal);
+ Ldloc(i);
+ Call(s_stringGetCharsMethod);
+ EmitMatchCharacterClass(set.Set, set.CaseInsensitive);
+ Brfalse(increment);
+
+ // base.runtextpos = i + 1;
+ // return true;
+ Ldthis();
+ Ldloc(i);
+ Ldc(1);
+ Add();
+ Stfld(s_runtextposField);
+ Ldc(1);
+ Ret();
- MarkLabel(l3);
+ // for (...; ...; i--)
+ MarkLabel(increment);
+ Ldloc(i);
+ Ldc(1);
+ Sub();
+ Stloc(i);
- Mvlocfld(_runtextposLocal, s_runtextposField);
- Ret();
+ // for (...; i >= runtextbeg; ...)
+ MarkLabel(condition);
+ Ldloc(i);
+ Ldloc(_runtextbegLocal!);
+ BgeFar(body);
- MarkLabel(l4);
- Ldc(0);
- Ret();
+ BrFar(returnFalse);
+ }
}
- void GenerateLeadingCharacter_LeftToRight()
+ void GenerateFixedSet_LeftToRight()
{
- Debug.Assert(_leadingCharClasses != null && _leadingCharClasses.Length > 0);
-
- // If minRequiredLength > 0, we already output a more stringent check. In the rare case
- // where we were unable to get an accurate enough min required length to ensure it's larger
- // than the prefixes we calculated, we also need to ensure we have enough spaces for those,
- // as they also represent a min required length.
- if (minRequiredLength < _leadingCharClasses.Length)
- {
- // if (runtextpos >= runtextend - (_leadingCharClasses.Length - 1)) goto returnFalse;
- Ldloc(_runtextendLocal);
- if (_leadingCharClasses.Length > 1)
- {
- Ldc(_leadingCharClasses.Length - 1);
- Sub();
- }
- Ldloc(_runtextposLocal);
- BleFar(returnFalse);
- }
+ List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)>? sets = _code.FindOptimizations.FixedDistanceSets;
+ (char[]? Chars, string Set, int Distance, bool CaseInsensitive) primarySet = sets![0];
+ const int MaxSets = 4;
+ int setsToUse = Math.Min(sets.Count, MaxSets);
using RentedLocalBuilder iLocal = RentInt32Local();
using RentedLocalBuilder textSpanLocal = RentReadOnlySpanCharLocal();
// If we can use IndexOf{Any}, try to accelerate the skip loop via vectorization to match the first prefix.
// We can use it if this is a case-sensitive class with a small number of characters in the class.
- Span<char> setChars = stackalloc char[3]; // up to 3 characters handled by IndexOf{Any} below
- int setCharsCount = 0, charClassIndex = 0;
- bool canUseIndexOf =
- !_leadingCharClasses[0].CaseInsensitive &&
- (setCharsCount = RegexCharClass.GetSetChars(_leadingCharClasses[0].CharClass, setChars)) > 0 &&
- !RegexCharClass.IsNegated(_leadingCharClasses[0].CharClass);
- bool needLoop = !canUseIndexOf || _leadingCharClasses.Length > 1;
+ int setIndex = 0;
+ bool canUseIndexOf = !primarySet.CaseInsensitive && primarySet.Chars is not null;
+ bool needLoop = !canUseIndexOf || setsToUse > 1;
Label checkSpanLengthLabel = default;
Label charNotInClassLabel = default;
if (canUseIndexOf)
{
- charClassIndex = 1;
+ setIndex = 1;
if (needLoop)
{
- // textSpan.Slice(iLocal)
+ // textSpan.Slice(iLocal + primarySet.Distance);
Ldloca(textSpanLocal);
Ldloc(iLocal);
+ if (primarySet.Distance != 0)
+ {
+ Ldc(primarySet.Distance);
+ Add();
+ }
+ Call(s_spanSliceIntMethod);
+ }
+ else if (primarySet.Distance != 0)
+ {
+ // textSpan.Slice(primarySet.Distance)
+ Ldloca(textSpanLocal);
+ Ldc(primarySet.Distance);
Call(s_spanSliceIntMethod);
}
else
Ldloc(textSpanLocal);
}
- switch (setCharsCount)
+ switch (primarySet.Chars!.Length)
{
case 1:
// tmp = ...IndexOf(setChars[0]);
- Ldc(setChars[0]);
+ Ldc(primarySet.Chars[0]);
Call(s_spanIndexOfChar);
break;
case 2:
// tmp = ...IndexOfAny(setChars[0], setChars[1]);
- Ldc(setChars[0]);
- Ldc(setChars[1]);
+ Ldc(primarySet.Chars[0]);
+ Ldc(primarySet.Chars[1]);
Call(s_spanIndexOfAnyCharChar);
break;
- default: // 3
+ case 3:
// tmp = ...IndexOfAny(setChars[0], setChars[1], setChars[2]});
- Debug.Assert(setCharsCount == 3);
- Ldc(setChars[0]);
- Ldc(setChars[1]);
- Ldc(setChars[2]);
+ Ldc(primarySet.Chars[0]);
+ Ldc(primarySet.Chars[1]);
+ Ldc(primarySet.Chars[2]);
Call(s_spanIndexOfAnyCharCharChar);
break;
+
+ default:
+ Ldstr(new string(primarySet.Chars));
+ Call(s_stringAsSpanMethod);
+ Call(s_spanIndexOfAnySpan);
+ break;
}
if (needLoop)
BltFar(returnFalse);
}
- // if (i >= textSpan.Length - (_leadingCharClasses.Length - 1)) goto returnFalse;
- if (_leadingCharClasses.Length > 1)
+ // if (i >= textSpan.Length - (minRequiredLength - 1)) goto returnFalse;
+ if (sets.Count > 1)
{
Debug.Assert(needLoop);
Ldloca(textSpanLocal);
Call(s_spanGetLengthMethod);
- Ldc(_leadingCharClasses.Length - 1);
+ Ldc(minRequiredLength - 1);
Sub();
Ldloc(iLocal);
BleFar(returnFalse);
// if (!CharInClass(textSpan[i + 1], prefix[1], "...")) continue;
// if (!CharInClass(textSpan[i + 2], prefix[2], "...")) continue;
// ...
- Debug.Assert(charClassIndex == 0 || charClassIndex == 1);
- for ( ; charClassIndex < _leadingCharClasses.Length; charClassIndex++)
+ Debug.Assert(setIndex == 0 || setIndex == 1);
+ for ( ; setIndex < sets.Count; setIndex++)
{
Debug.Assert(needLoop);
Ldloca(textSpanLocal);
Ldloc(iLocal);
- if (charClassIndex > 0)
+ if (sets[setIndex].Distance != 0)
{
- Ldc(charClassIndex);
+ Ldc(sets[setIndex].Distance);
Add();
}
Call(s_spanGetItemMethod);
LdindU2();
- EmitMatchCharacterClass(_leadingCharClasses[charClassIndex].CharClass, _leadingCharClasses[charClassIndex].CaseInsensitive);
+ EmitMatchCharacterClass(sets[setIndex].Set, sets[setIndex].CaseInsensitive);
BrfalseFar(charNotInClassLabel);
}
Add();
Stloc(iLocal);
- // for (...; i < span.Length - (_leadingCharClasses.Length - 1); ...);
+ // for (...; i < span.Length - (minRequiredLength - 1); ...);
MarkLabel(checkSpanLengthLabel);
Ldloc(iLocal);
Ldloca(textSpanLocal);
Call(s_spanGetLengthMethod);
- if (_leadingCharClasses.Length > 1)
+ if (setsToUse > 1 || primarySet.Distance != 0)
{
- Ldc(_leadingCharClasses.Length - 1);
+ Ldc(minRequiredLength - 1);
Sub();
}
BltFar(loopBody);
break;
case RegexNode.Concatenate:
- int childCount = node.ChildCount();
- for (int i = 0; i < childCount; i++)
- {
- if (emitLengthChecksIfRequired && node.TryGetJoinableLengthCheckChildRange(i, out int requiredLength, out int exclusiveEnd))
- {
- EmitSpanLengthCheck(requiredLength);
- for (; i < exclusiveEnd; i++)
- {
- EmitNode(node.Child(i), i + 1 < childCount ? node.Child(i + 1) : subsequent, emitLengthChecksIfRequired: false);
- }
-
- i--;
- continue;
- }
-
- EmitNode(node.Child(i), i + 1 < childCount ? node.Child(i + 1) : subsequent);
- }
+ EmitConcatenation(node, subsequent, emitLengthChecksIfRequired);
break;
case RegexNode.Capture:
Stfld(s_runtextposField);
}
+ // Emits code for a concatenation: every child of the node is emitted in order through EmitNode, passing the following child (or `subsequent` for the last child) as the node that comes next.
+ void EmitConcatenation(RegexNode node, RegexNode? subsequent, bool emitLengthChecksIfRequired)
+ {
+ int childCount = node.ChildCount();
+ for (int i = 0; i < childCount; i++)
+ {
+ if (emitLengthChecksIfRequired && node.TryGetJoinableLengthCheckChildRange(i, out int requiredLength, out int exclusiveEnd)) // children [i, exclusiveEnd) can be covered by one length check
+ {
+ EmitSpanLengthCheck(requiredLength); // one up-front check for the whole run
+ for (; i < exclusiveEnd; i++)
+ {
+ EmitNode(node.Child(i), i + 1 < childCount ? node.Child(i + 1) : subsequent, emitLengthChecksIfRequired: false); // per-child length checks are disabled because the check above was already emitted
+ }
+
+ i--; // undo one increment so the outer loop's i++ resumes at exclusiveEnd rather than skipping it
+ continue;
+ }
+
+ EmitNode(node.Child(i), i + 1 < childCount ? node.Child(i + 1) : subsequent);
+ }
+ }
+
// Emits the code to handle a single-character match.
void EmitSingleChar(RegexNode node, bool emitLengthCheck = true, LocalBuilder? offset = null)
{
}
else
{
- if (IsCaseInsensitive(node) && RegexCharClass.ParticipatesInCaseConversion(node.Ch))
+ if (IsCaseInsensitive(node))
{
CallToLower();
}
EmitTextSpanOffset();
textSpanPos++;
LdindU2();
- if (caseInsensitive && RegexCharClass.ParticipatesInCaseConversion(s[i]))
+ if (caseInsensitive)
{
CallToLower();
}
Label atomicLoopDoneLabel = DefineLabel();
- Span<char> setChars = stackalloc char[3]; // 3 is max we can use with IndexOfAny
+ Span<char> setChars = stackalloc char[5]; // max optimized by IndexOfAny today
int numSetChars = 0;
if (node.IsNotoneFamily &&
maxIterations == int.MaxValue &&
- (!IsCaseInsensitive(node) || !RegexCharClass.ParticipatesInCaseConversion(node.Ch)))
+ (!IsCaseInsensitive(node)))
{
// For Notone, we're looking for a specific character, as everything until we find
// it is consumed by the loop. If we're unbounded, such as with ".*" and if we're case-sensitive,
else if (node.IsSetFamily &&
maxIterations == int.MaxValue &&
!IsCaseInsensitive(node) &&
- (numSetChars = RegexCharClass.GetSetChars(node.Str!, setChars)) > 1 &&
+ (numSetChars = RegexCharClass.GetSetChars(node.Str!, setChars)) != 0 &&
RegexCharClass.IsNegated(node.Str!))
{
- // If the set is negated and contains only 2 or 3 characters (if it contained 1 and was negated, it would
+ // If the set is negated and contains only a few characters (if it contained 1 and was negated, it would
// have been reduced to a Notone), we can use an IndexOfAny to find any of the target characters.
// As with the notoneloopatomic above, the unbounded constraint is purely for simplicity.
+ Debug.Assert(numSetChars > 1);
- // int i = textSpan.Slice(textSpanPos).IndexOfAny(ch1, ch2{, ch3});
+ // int i = textSpan.Slice(textSpanPos).IndexOfAny(ch1, ch2, ...);
if (textSpanPos > 0)
{
Ldloca(textSpanLocal);
{
Ldloc(textSpanLocal);
}
- Ldc(setChars[0]);
- Ldc(setChars[1]);
- if (numSetChars == 2)
- {
- Call(s_spanIndexOfAnyCharChar);
- }
- else
+ switch (numSetChars)
{
- Debug.Assert(numSetChars == 3);
- Ldc(setChars[2]);
- Call(s_spanIndexOfAnyCharCharChar);
+ case 2:
+ Ldc(setChars[0]);
+ Ldc(setChars[1]);
+ Call(s_spanIndexOfAnyCharChar);
+ break;
+
+ case 3:
+ Ldc(setChars[0]);
+ Ldc(setChars[1]);
+ Ldc(setChars[2]);
+ Call(s_spanIndexOfAnyCharCharChar);
+ break;
+
+ default:
+ // More than three set characters: hand them over as a span and search for ANY of them.
+ // This must be the IndexOfAny(ReadOnlySpan<char>) overload; IndexOf(ReadOnlySpan<char>) would instead
+ // look for the characters as one contiguous substring and stop the loop at the wrong place (or never).
+ Ldstr(setChars.Slice(0, numSetChars).ToString());
+ Call(s_stringAsSpanMethod);
+ Call(s_spanIndexOfAnySpan);
+ break;
}
Stloc(iterationLocal);
}
else
{
- if (IsCaseInsensitive(node) && RegexCharClass.ParticipatesInCaseConversion(node.Ch))
+ if (IsCaseInsensitive(node))
{
CallToLower();
}
}
else
{
- if (IsCaseInsensitive(node) && RegexCharClass.ParticipatesInCaseConversion(node.Ch))
+ if (IsCaseInsensitive(node))
{
CallToLower();
}
}
else
{
- if (IsCaseInsensitive() && RegexCharClass.ParticipatesInCaseConversion(Operand(0)))
+ if (IsCaseInsensitive())
{
CallToLower();
}
Add();
}
Call(s_stringGetCharsMethod);
- if (IsCaseInsensitive() && RegexCharClass.ParticipatesInCaseConversion(str[i]))
+ if (IsCaseInsensitive())
{
CallToLower();
}
Ldc(str.Length - i);
Sub();
Call(s_stringGetCharsMethod);
- if (IsCaseInsensitive() && RegexCharClass.ParticipatesInCaseConversion(str[i]))
+ if (IsCaseInsensitive())
{
CallToLower();
}
}
else
{
- if (IsCaseInsensitive() && RegexCharClass.ParticipatesInCaseConversion(Operand(0)))
+ if (IsCaseInsensitive())
{
CallToLower();
}
Label loopEnd = DefineLabel();
string? set = Code() == RegexCode.Setloop || Code() == RegexCode.Setloopatomic ? _strings![Operand(0)] : null;
- Span<char> setChars = stackalloc char[3];
+ Span<char> setChars = stackalloc char[5]; // max optimized by IndexOfAny today
int numSetChars;
// If this is a notoneloop{atomic} and we're left-to-right and case-sensitive,
// we can use the vectorized IndexOf to search for the target character.
if ((Code() == RegexCode.Notoneloop || Code() == RegexCode.Notoneloopatomic) &&
!IsRightToLeft() &&
- (!IsCaseInsensitive() || !RegexCharClass.ParticipatesInCaseConversion(Operand(0))))
+ (!IsCaseInsensitive()))
{
// i = runtext.AsSpan(runtextpos, len).IndexOf(ch);
Ldloc(_runtextLocal!);
else if ((Code() == RegexCode.Setloop || Code() == RegexCode.Setloopatomic) &&
!IsRightToLeft() &&
!IsCaseInsensitive() &&
- (numSetChars = RegexCharClass.GetSetChars(set!, setChars)) > 1 &&
+ (numSetChars = RegexCharClass.GetSetChars(set!, setChars)) != 0 &&
RegexCharClass.IsNegated(set!))
{
// Similarly, if this is a setloop{atomic} and we're left-to-right and case-sensitive,
- // and if the set contains only 2 or 3 negated chars, we can use the vectorized IndexOfAny
+ // and if the set contains only a few negated chars, we can use the vectorized IndexOfAny
// to search for those chars.
+ Debug.Assert(numSetChars > 1);
// i = runtext.AsSpan(runtextpos, len).IndexOfAny(ch1, ch2{, ch3});
Ldloc(_runtextLocal!);
Ldloc(_runtextposLocal!);
Ldloc(lenLocal);
Call(s_stringAsSpanIntIntMethod);
- Ldc(setChars[0]);
- Ldc(setChars[1]);
- if (numSetChars == 2)
- {
- Call(s_spanIndexOfAnyCharChar);
- }
- else
+ switch (numSetChars)
{
- Debug.Assert(numSetChars == 3);
- Ldc(setChars[2]);
- Call(s_spanIndexOfAnyCharCharChar);
+ case 2:
+ Ldc(setChars[0]);
+ Ldc(setChars[1]);
+ Call(s_spanIndexOfAnyCharChar);
+ break;
+
+ case 3:
+ Ldc(setChars[0]);
+ Ldc(setChars[1]);
+ Ldc(setChars[2]);
+ Call(s_spanIndexOfAnyCharCharChar);
+ break;
+
+ default:
+ // More than three set characters: hand them over as a span and search for ANY of them.
+ // This must be the IndexOfAny(ReadOnlySpan<char>) overload; IndexOf(ReadOnlySpan<char>) would instead
+ // look for the characters as one contiguous substring rather than for any single member of the set.
+ Ldstr(setChars.Slice(0, numSetChars).ToString());
+ Call(s_stringAsSpanMethod);
+ Call(s_spanIndexOfAnySpan);
+ break;
}
Stloc(iLocal);
}
else
{
- if (IsCaseInsensitive() && RegexCharClass.ParticipatesInCaseConversion(Operand(0)))
+ if (IsCaseInsensitive())
{
CallToLower();
}
}
else
{
- if (IsCaseInsensitive() && RegexCharClass.ParticipatesInCaseConversion(Operand(0)))
+ if (IsCaseInsensitive())
{
CallToLower();
}
// Next, if there's only 2 or 3 chars in the set (fairly common due to the sets we create for prefixes),
// it's cheaper and smaller to compare against each than it is to use a lookup table.
- if (!invariant)
+ if (!invariant && !RegexCharClass.IsNegated(charClass))
{
- Span<char> setChars = stackalloc char[3];
+ Span<char> setChars = stackalloc char[4];
int numChars = RegexCharClass.GetSetChars(charClass, setChars);
- if (numChars > 0 && !RegexCharClass.IsNegated(charClass))
+ if (numChars is 2 or 3)
{
- // (ch == setChars[0]) | (ch == setChars[1]) { | (ch == setChars[2]) }
- Debug.Assert(numChars == 2 || numChars == 3);
- Ldloc(tempLocal);
- Ldc(setChars[0]);
- Ceq();
- Ldloc(tempLocal);
- Ldc(setChars[1]);
- Ceq();
- Or();
+ if ((setChars[0] | 0x20) == setChars[1]) // special-case common case of an upper and lowercase ASCII letter combination
+ {
+ // ((ch | 0x20) == setChars[1])
+ Ldloc(tempLocal);
+ Ldc(0x20);
+ Or();
+ Ldc(setChars[1]);
+ Ceq();
+ }
+ else
+ {
+ // (ch == setChars[0]) | (ch == setChars[1])
+ Ldloc(tempLocal);
+ Ldc(setChars[0]);
+ Ceq();
+ Ldloc(tempLocal);
+ Ldc(setChars[1]);
+ Ceq();
+ Or();
+ }
+
+ // | (ch == setChars[2])
if (numChars == 3)
{
Ldloc(tempLocal);
return;
}
+ else if (numChars == 4 &&
+ (setChars[0] | 0x20) == setChars[1] &&
+ (setChars[2] | 0x20) == setChars[3])
+ {
+ // ((ch | 0x20) == setChars[1])
+ Ldloc(tempLocal);
+ Ldc(0x20);
+ Or();
+ Ldc(setChars[1]);
+ Ceq();
+
+ // ((ch | 0x20) == setChars[3])
+ Ldloc(tempLocal);
+ Ldc(0x20);
+ Or();
+ Ldc(setChars[3]);
+ Ceq();
+
+ Or();
+ return;
+ }
}
using RentedLocalBuilder resultLocal = RentInt32Local();
--- /dev/null
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.Globalization;
+
+namespace System.Text.RegularExpressions
+{
+ /// <summary>Contains state and provides operations related to finding the next location a match could possibly begin.</summary>
+ internal sealed class RegexFindOptimizations
+ {
+ /// <summary>The minimum length an input needs to be in order to match the pattern. May be 0.</summary>
+ private readonly int _minRequiredLength;
+ /// <summary>True if the input should be processed right-to-left rather than left-to-right.</summary>
+ private readonly bool _rightToLeft;
+ /// <summary>Provides the ToLower routine for lowercasing characters.</summary>
+ private readonly TextInfo _textInfo;
+ /// <summary>Lookup table used for optimizing ASCII when doing set queries.</summary>
+ private readonly uint[]?[]? _asciiLookups;
+
+ public RegexFindOptimizations(RegexTree tree, CultureInfo culture)
+ {
+ _rightToLeft = (tree.Options & RegexOptions.RightToLeft) != 0;
+ _minRequiredLength = tree.MinRequiredLength;
+ _textInfo = culture.TextInfo;
+
+ // Analysis runs in priority order (leading anchor, then case-sensitive prefix, then sets); the first one that applies sets FindMode and returns.
+ // Start with any anchor that begins the expression. If there is a rooting one, no search is needed: matching can only happen at that single location.
+ LeadingAnchor = RegexPrefixAnalyzer.FindLeadingAnchor(tree);
+ if (_rightToLeft)
+ {
+ // Filter out Bol for RightToLeft, as we don't currently optimize for it.
+ LeadingAnchor &= ~RegexPrefixAnalyzer.Bol;
+ }
+ if ((LeadingAnchor & (RegexPrefixAnalyzer.Beginning | RegexPrefixAnalyzer.Start | RegexPrefixAnalyzer.EndZ | RegexPrefixAnalyzer.End)) != 0)
+ {
+ FindMode = (LeadingAnchor, _rightToLeft) switch
+ {
+ (RegexPrefixAnalyzer.Beginning, false) => FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Beginning,
+ (RegexPrefixAnalyzer.Beginning, true) => FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Beginning,
+ (RegexPrefixAnalyzer.Start, false) => FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Start,
+ (RegexPrefixAnalyzer.Start, true) => FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Start,
+ (RegexPrefixAnalyzer.End, false) => FindNextStartingPositionMode.LeadingAnchor_LeftToRight_End,
+ (RegexPrefixAnalyzer.End, true) => FindNextStartingPositionMode.LeadingAnchor_RightToLeft_End,
+ (_, false) => FindNextStartingPositionMode.LeadingAnchor_LeftToRight_EndZ, // any remaining value is treated as EndZ
+ (_, true) => FindNextStartingPositionMode.LeadingAnchor_RightToLeft_EndZ,
+ };
+ return;
+ }
+
+ // If there's a leading case-sensitive substring of more than one character, just use IndexOf and inherit all of its optimizations.
+ string caseSensitivePrefix = RegexPrefixAnalyzer.FindCaseSensitivePrefix(tree);
+ if (caseSensitivePrefix.Length > 1)
+ {
+ LeadingCaseSensitivePrefix = caseSensitivePrefix;
+ FindMode = _rightToLeft ?
+ FindNextStartingPositionMode.LeadingPrefix_RightToLeft_CaseSensitive :
+ FindNextStartingPositionMode.LeadingPrefix_LeftToRight_CaseSensitive;
+ return;
+ }
+
+ // At this point there are no fast-searchable anchors or case-sensitive prefixes. We can now analyze the
+ // pattern for sets and then use any found sets to determine what kind of search to perform.
+
+ // If we're compiling, then the compilation process already handles sets that reduce to a single literal,
+ // so we can simplify and just always go for the sets.
+ bool dfa = (tree.Options & RegexOptions.NonBacktracking) != 0;
+ bool compiled = (tree.Options & RegexOptions.Compiled) != 0 && !dfa; // for now, we never generate code for NonBacktracking, so treat it as non-compiled
+ bool interpreter = !compiled && !dfa;
+
+ // Right-to-left is used very rarely, so we don't invest in special-casing it: we only compute a set for
+ // whatever can start the expression and search for that.
+ // NOTE(review): only _rightToLeft is tested here; a left-to-right interpreter is not limited to a leading set
+ // by this check and instead reaches FindFixedDistanceSets below with thorough: false.
+ if (_rightToLeft)
+ {
+ // Determine a set for anything that can possibly start the expression.
+ if (RegexPrefixAnalyzer.FindFirstCharClass(tree, culture) is (string CharClass, bool CaseInsensitive) set)
+ {
+ // See if the set is limited to holding only a few characters.
+ Span<char> scratch = stackalloc char[5]; // max optimized by IndexOfAny today
+ int scratchCount;
+ char[]? chars = null;
+ if (!RegexCharClass.IsNegated(set.CharClass) &&
+ (scratchCount = RegexCharClass.GetSetChars(set.CharClass, scratch)) > 0)
+ {
+ chars = scratch.Slice(0, scratchCount).ToArray();
+ }
+
+ if (!compiled &&
+ chars is { Length: 1 })
+ {
+ // The set contains one and only one character, meaning every match starts
+ // with the same literal value (potentially case-insensitive). Search for that.
+ FixedDistanceLiteral = (chars[0], 0);
+ FindMode = (_rightToLeft, set.CaseInsensitive) switch
+ {
+ (false, false) => FindNextStartingPositionMode.FixedLiteral_LeftToRight_CaseSensitive, // not reachable: _rightToLeft is true in this branch
+ (false, true) => FindNextStartingPositionMode.FixedLiteral_LeftToRight_CaseInsensitive, // not reachable: _rightToLeft is true in this branch
+ (true, false) => FindNextStartingPositionMode.LeadingLiteral_RightToLeft_CaseSensitive,
+ (true, true) => FindNextStartingPositionMode.LeadingLiteral_RightToLeft_CaseInsensitive,
+ };
+ }
+ else
+ {
+ // The set may match multiple characters. Search for that.
+ FixedDistanceSets = new() { (chars, set.CharClass, 0, set.CaseInsensitive) };
+ FindMode = (_rightToLeft, set.CaseInsensitive) switch
+ {
+ (false, false) => FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseSensitive, // not reachable: _rightToLeft is true in this branch
+ (false, true) => FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive, // not reachable: _rightToLeft is true in this branch
+ (true, false) => FindNextStartingPositionMode.LeadingSet_RightToLeft_CaseSensitive,
+ (true, true) => FindNextStartingPositionMode.LeadingSet_RightToLeft_CaseInsensitive,
+ };
+ _asciiLookups = new uint[1][]; // one lookup slot, for the single set stored above
+ }
+ }
+ return;
+ }
+
+ // We're now left-to-right only and looking for sets.
+
+ // Build up a list of all of the sets that are a fixed distance from the start of the expression.
+ List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)>? fixedDistanceSets = RegexPrefixAnalyzer.FindFixedDistanceSets(tree, culture, thorough: !interpreter);
+ if (fixedDistanceSets is not null)
+ {
+ Debug.Assert(fixedDistanceSets.Count != 0);
+
+ // Determine whether to do searching based on one or more sets or on a single literal. Compiled engines
+ // don't need to special-case literals as they already do codegen to create the optimal lookup based on
+ // the set's characteristics.
+ if (!compiled &&
+ fixedDistanceSets.Count == 1 &&
+ fixedDistanceSets[0].Chars is { Length: 1 })
+ {
+ FixedDistanceLiteral = (fixedDistanceSets[0].Chars![0], fixedDistanceSets[0].Distance);
+ FindMode = fixedDistanceSets[0].CaseInsensitive ?
+ FindNextStartingPositionMode.FixedLiteral_LeftToRight_CaseInsensitive :
+ FindNextStartingPositionMode.FixedLiteral_LeftToRight_CaseSensitive;
+ }
+ else
+ {
+ // Limit how many sets we use to avoid doing lots of unnecessary work. The list was already
+ // sorted from best to worst, so just keep the first ones up to our limit.
+ const int MaxSetsToUse = 3; // arbitrary tuned limit
+ if (fixedDistanceSets.Count > MaxSetsToUse)
+ {
+ fixedDistanceSets.RemoveRange(MaxSetsToUse, fixedDistanceSets.Count - MaxSetsToUse);
+ }
+
+ // Store the sets, and compute which mode to use: a single set at distance 0 is a "leading set"; anything else is "fixed sets". Case-sensitivity comes from the first set.
+ FixedDistanceSets = fixedDistanceSets;
+ FindMode = (fixedDistanceSets.Count == 1 && fixedDistanceSets[0].Distance == 0, fixedDistanceSets[0].CaseInsensitive) switch
+ {
+ (true, true) => FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive,
+ (true, false) => FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseSensitive,
+ (false, true) => FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive,
+ (false, false) => FindNextStartingPositionMode.FixedSets_LeftToRight_CaseSensitive,
+ };
+ _asciiLookups = new uint[fixedDistanceSets.Count][]; // one lookup slot per retained set, indexed like FixedDistanceSets
+ }
+ return;
+ }
+ }
+
+ /// <summary>Gets the selected mode for performing the next <see cref="TryFindNextStartingPosition"/> operation.</summary>
+ public FindNextStartingPositionMode FindMode { get; } = FindNextStartingPositionMode.NoSearch;
+
+ /// <summary>Gets the leading anchor, if one exists (RegexPrefixAnalyzer.Bol, etc).</summary>
+ public int LeadingAnchor { get; }
+
+ /// <summary>Gets the leading prefix. May be an empty string.</summary>
+ public string LeadingCaseSensitivePrefix { get; } = string.Empty;
+
+ /// <summary>When in fixed distance literal mode, gets the literal and how far it is from the start of the pattern.</summary>
+ public (char Literal, int Distance) FixedDistanceLiteral { get; }
+
+ /// <summary>When in fixed distance set mode, gets the set and how far it is from the start of the pattern.</summary>
+ /// <remarks>The case-insensitivity of the 0th entry will always match the mode selected, but subsequent entries may not.</remarks>
+ public List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)>? FixedDistanceSets { get; }
+
+ /// <summary>Try to advance to the next starting position that might be a location for a match, using the search selected by <see cref="FindMode"/>.</summary>
+ /// <param name="text">The text to search.</param>
+ /// <param name="pos">The position in <paramref name="text"/>. On success this is updated with the found position; on failure it is set to <paramref name="end"/> (left-to-right) or <paramref name="beginning"/> (right-to-left).</param>
+ /// <param name="beginning">The index in <paramref name="text"/> to consider the beginning for beginning anchor purposes.</param>
+ /// <param name="start">The index in <paramref name="text"/> to consider the start for start anchor purposes.</param>
+ /// <param name="end">The index in <paramref name="text"/> to consider the non-inclusive end of the string.</param>
+ /// <returns>true if a position to attempt a match was found (always the case when no search mode applies); false if none was found.</returns>
+ public bool TryFindNextStartingPosition(string text, ref int pos, int beginning, int start, int end)
+ {
+ // Return early if we know there's not enough input left to match.
+ if (!_rightToLeft)
+ {
+ if (pos > end - _minRequiredLength)
+ {
+ pos = end;
+ return false;
+ }
+ }
+ else
+ {
+ if (pos - _minRequiredLength < beginning)
+ {
+ pos = beginning;
+ return false;
+ }
+ }
+
+ // Optimize the handling of a Beginning-Of-Line (BOL) anchor (only for left-to-right). BOL is special, in that unlike
+ // other anchors like Beginning, there are potentially multiple places a BOL can match. So unlike
+ // the other anchors, which all skip all subsequent processing if found, with BOL we just use it
+ // to boost our position to the next line, and then continue normally with any searches.
+ if (LeadingAnchor == RegexPrefixAnalyzer.Bol)
+ {
+ // If we're not currently positioned at the beginning of a line (either
+ // the beginning of the string or just after a line feed), find the next
+ // newline and position just after it.
+ Debug.Assert(!_rightToLeft);
+ if (pos > beginning && text[pos - 1] != '\n')
+ {
+ int newline = text.IndexOf('\n', pos);
+ if (newline == -1 || newline + 1 > end) // no later line start at or before end
+ {
+ pos = end;
+ return false;
+ }
+
+ pos = newline + 1;
+ }
+ }
+
+ switch (FindMode)
+ {
+ // There's an anchor. For some, we can simply compare against the current position.
+ // For others, we can jump to the relevant location.
+
+ case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Beginning:
+ if (pos > beginning)
+ {
+ pos = end;
+ return false;
+ }
+ return true;
+
+ case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Start:
+ if (pos > start)
+ {
+ pos = end;
+ return false;
+ }
+ return true;
+
+ case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_EndZ:
+ if (pos < end - 1)
+ {
+ pos = end - 1; // skip ahead to the last character; earlier positions are not attempted
+ }
+ return true;
+
+ case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_End:
+ if (pos < end)
+ {
+ pos = end; // skip ahead to the very end; earlier positions are not attempted
+ }
+ return true;
+
+ case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Beginning:
+ if (pos > beginning)
+ {
+ pos = beginning; // jump straight back to the beginning
+ }
+ return true;
+
+ case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Start:
+ if (pos < start)
+ {
+ pos = beginning;
+ return false;
+ }
+ return true;
+
+ case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_EndZ:
+ if (pos < end - 1 || (pos == end - 1 && text[pos] != '\n')) // only end, or end - 1 when the final character is '\n', is accepted
+ {
+ pos = beginning;
+ return false;
+ }
+ return true;
+
+ case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_End:
+ if (pos < end)
+ {
+ pos = beginning;
+ return false;
+ }
+ return true;
+
+ // There's a case-sensitive prefix. Search for it with ordinal IndexOf.
+
+ case FindNextStartingPositionMode.LeadingPrefix_LeftToRight_CaseSensitive:
+ {
+ int i = text.AsSpan(pos, end - pos).IndexOf(LeadingCaseSensitivePrefix.AsSpan());
+ if (i >= 0)
+ {
+ pos += i; // i is relative to pos
+ return true;
+ }
+
+ pos = end;
+ return false;
+ }
+
+ case FindNextStartingPositionMode.LeadingPrefix_RightToLeft_CaseSensitive:
+ {
+ int i = text.AsSpan(beginning, pos - beginning).LastIndexOf(LeadingCaseSensitivePrefix.AsSpan());
+ if (i >= 0)
+ {
+ pos = beginning + i + LeadingCaseSensitivePrefix.Length; // right-to-left: position just past the end of the found prefix
+ return true;
+ }
+
+ pos = beginning;
+ return false;
+ }
+
+ // There's a literal at the beginning of the pattern. Search for it.
+
+ case FindNextStartingPositionMode.LeadingLiteral_RightToLeft_CaseSensitive:
+ {
+ int i = text.AsSpan(beginning, pos - beginning).LastIndexOf(FixedDistanceLiteral.Literal);
+ if (i >= 0)
+ {
+ pos = beginning + i + 1; // right-to-left: position just past the found character
+ return true;
+ }
+
+ pos = beginning;
+ return false;
+ }
+
+ case FindNextStartingPositionMode.LeadingLiteral_RightToLeft_CaseInsensitive:
+ {
+ char ch = FixedDistanceLiteral.Literal;
+ TextInfo ti = _textInfo;
+
+ ReadOnlySpan<char> span = text.AsSpan(beginning, pos - beginning);
+ for (int i = span.Length - 1; i >= 0; i--)
+ {
+ if (ti.ToLower(span[i]) == ch) // input is lowercased before comparing against the stored literal
+ {
+ pos = beginning + i + 1;
+ return true;
+ }
+ }
+
+ pos = beginning;
+ return false;
+ }
+
+ // There's a set at the beginning of the pattern. Search for it.
+
+ case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseSensitive:
+ {
+ (char[]? chars, string set, _, _) = FixedDistanceSets![0];
+
+ ReadOnlySpan<char> span = text.AsSpan(pos, end - pos);
+ if (chars is not null)
+ {
+ int i = span.IndexOfAny(chars); // the set was reduced to a few explicit characters
+ if (i >= 0)
+ {
+ pos += i;
+ return true;
+ }
+ }
+ else
+ {
+ ref uint[]? startingAsciiLookup = ref _asciiLookups![0];
+ for (int i = 0; i < span.Length; i++)
+ {
+ if (RegexCharClass.CharInClass(span[i], set, ref startingAsciiLookup))
+ {
+ pos += i;
+ return true;
+ }
+ }
+ }
+
+ pos = end;
+ return false;
+ }
+
+ case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive:
+ {
+ ref uint[]? startingAsciiLookup = ref _asciiLookups![0];
+ string set = FixedDistanceSets![0].Set;
+ TextInfo ti = _textInfo;
+
+ ReadOnlySpan<char> span = text.AsSpan(pos, end - pos);
+ for (int i = 0; i < span.Length; i++)
+ {
+ if (RegexCharClass.CharInClass(ti.ToLower(span[i]), set, ref startingAsciiLookup))
+ {
+ pos += i;
+ return true;
+ }
+ }
+
+ pos = end;
+ return false;
+ }
+
+ case FindNextStartingPositionMode.LeadingSet_RightToLeft_CaseSensitive:
+ {
+ ref uint[]? startingAsciiLookup = ref _asciiLookups![0];
+ string set = FixedDistanceSets![0].Set;
+
+ ReadOnlySpan<char> span = text.AsSpan(beginning, pos - beginning);
+ for (int i = span.Length - 1; i >= 0; i--)
+ {
+ if (RegexCharClass.CharInClass(span[i], set, ref startingAsciiLookup))
+ {
+ pos = beginning + i + 1; // right-to-left: position just past the matching character
+ return true;
+ }
+ }
+
+ pos = beginning;
+ return false;
+ }
+
+ case FindNextStartingPositionMode.LeadingSet_RightToLeft_CaseInsensitive:
+ {
+ ref uint[]? startingAsciiLookup = ref _asciiLookups![0];
+ string set = FixedDistanceSets![0].Set;
+ TextInfo ti = _textInfo;
+
+ ReadOnlySpan<char> span = text.AsSpan(beginning, pos - beginning);
+ for (int i = span.Length - 1; i >= 0; i--)
+ {
+ if (RegexCharClass.CharInClass(ti.ToLower(span[i]), set, ref startingAsciiLookup))
+ {
+ pos = beginning + i + 1;
+ return true;
+ }
+ }
+
+ pos = beginning;
+ return false;
+ }
+
+ // There's a literal at a fixed offset from the beginning of the pattern. Search for it.
+
+ case FindNextStartingPositionMode.FixedLiteral_LeftToRight_CaseSensitive:
+ {
+ Debug.Assert(FixedDistanceLiteral.Distance <= _minRequiredLength);
+
+ int i = text.AsSpan(pos + FixedDistanceLiteral.Distance, end - pos - FixedDistanceLiteral.Distance).IndexOf(FixedDistanceLiteral.Literal);
+ if (i >= 0)
+ {
+ pos += i; // the searched span began Distance chars past pos, so pos + i is the candidate match start
+ return true;
+ }
+
+ pos = end;
+ return false;
+ }
+
+ case FindNextStartingPositionMode.FixedLiteral_LeftToRight_CaseInsensitive:
+ {
+ Debug.Assert(FixedDistanceLiteral.Distance <= _minRequiredLength);
+
+ char ch = FixedDistanceLiteral.Literal;
+ TextInfo ti = _textInfo;
+
+ ReadOnlySpan<char> span = text.AsSpan(pos + FixedDistanceLiteral.Distance, end - pos - FixedDistanceLiteral.Distance);
+ for (int i = 0; i < span.Length; i++)
+ {
+ if (ti.ToLower(span[i]) == ch)
+ {
+ pos += i; // as above: span is offset by Distance, so pos + i is the candidate match start
+ return true;
+ }
+ }
+
+ pos = end;
+ return false;
+ }
+
+ // There are one or more sets at fixed offsets from the start of the pattern.
+
+ case FindNextStartingPositionMode.FixedSets_LeftToRight_CaseSensitive:
+ {
+ List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)> sets = FixedDistanceSets!;
+ (char[]? primaryChars, string primarySet, int primaryDistance, _) = sets[0];
+ int endMinusRequiredLength = end - Math.Max(1, _minRequiredLength); // last position at which a candidate is still considered
+
+ if (primaryChars is not null)
+ {
+ for (int inputPosition = pos; inputPosition <= endMinusRequiredLength; inputPosition++)
+ {
+ int offset = inputPosition + primaryDistance;
+ int index = text.IndexOfAny(primaryChars, offset, end - offset);
+ if (index < 0)
+ {
+ break;
+ }
+
+ inputPosition = index - primaryDistance; // translate the hit back to the position where a match would start
+ if (inputPosition > endMinusRequiredLength)
+ {
+ break;
+ }
+
+ for (int i = 1; i < sets.Count; i++) // the primary set matched; verify every remaining set at its own offset
+ {
+ (_, string nextSet, int nextDistance, bool nextCaseInsensitive) = sets[i];
+ char c = text[inputPosition + nextDistance];
+ if (!RegexCharClass.CharInClass(nextCaseInsensitive ? _textInfo.ToLower(c) : c, nextSet, ref _asciiLookups![i]))
+ {
+ goto Bumpalong;
+ }
+ }
+
+ pos = inputPosition;
+ return true;
+
+ Bumpalong:;
+ }
+ }
+ else
+ {
+ ref uint[]? startingAsciiLookup = ref _asciiLookups![0];
+
+ for (int inputPosition = pos; inputPosition <= endMinusRequiredLength; inputPosition++)
+ {
+ char c = text[inputPosition + primaryDistance];
+ if (!RegexCharClass.CharInClass(c, primarySet, ref startingAsciiLookup))
+ {
+ goto Bumpalong;
+ }
+
+ for (int i = 1; i < sets.Count; i++)
+ {
+ (_, string nextSet, int nextDistance, bool nextCaseInsensitive) = sets[i];
+ c = text[inputPosition + nextDistance];
+ if (!RegexCharClass.CharInClass(nextCaseInsensitive ? _textInfo.ToLower(c) : c, nextSet, ref _asciiLookups![i]))
+ {
+ goto Bumpalong;
+ }
+ }
+
+ pos = inputPosition;
+ return true;
+
+ Bumpalong:;
+ }
+ }
+
+ pos = end;
+ return false;
+ }
+
+ case FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive:
+ {
+ List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)> sets = FixedDistanceSets!;
+ (_, string primarySet, int primaryDistance, _) = sets[0];
+
+ int endMinusRequiredLength = end - Math.Max(1, _minRequiredLength);
+ TextInfo ti = _textInfo;
+ ref uint[]? startingAsciiLookup = ref _asciiLookups![0];
+
+ for (int inputPosition = pos; inputPosition <= endMinusRequiredLength; inputPosition++)
+ {
+ char c = text[inputPosition + primaryDistance];
+ if (!RegexCharClass.CharInClass(ti.ToLower(c), primarySet, ref startingAsciiLookup)) // the primary set is always compared case-insensitively in this mode
+ {
+ goto Bumpalong;
+ }
+
+ for (int i = 1; i < sets.Count; i++)
+ {
+ (_, string nextSet, int nextDistance, bool nextCaseInsensitive) = sets[i];
+ c = text[inputPosition + nextDistance];
+ if (!RegexCharClass.CharInClass(nextCaseInsensitive ? _textInfo.ToLower(c) : c, nextSet, ref _asciiLookups![i])) // later sets carry their own case-sensitivity flag
+ {
+ goto Bumpalong;
+ }
+ }
+
+ pos = inputPosition;
+ return true;
+
+ Bumpalong:;
+ }
+
+ pos = end;
+ return false;
+ }
+
+ // Nothing special to look for. Just return true indicating this is a valid position to try to match.
+
+ default:
+ Debug.Assert(FindMode == FindNextStartingPositionMode.NoSearch);
+ return true;
+ }
+ }
+ }
+
+ /// <summary>Identifies the strategy used to search for the next position at which a match could start. Members have no explicit values, so their declaration order determines their numeric values.</summary>
+ internal enum FindNextStartingPositionMode
+ {
+ /// <summary>A "beginning" anchor at the beginning of the left-to-right pattern.</summary>
+ LeadingAnchor_LeftToRight_Beginning,
+ /// <summary>A "start" anchor at the beginning of the left-to-right pattern.</summary>
+ LeadingAnchor_LeftToRight_Start,
+ /// <summary>An "endz" anchor at the beginning of the left-to-right pattern. This is rare.</summary>
+ LeadingAnchor_LeftToRight_EndZ,
+ /// <summary>An "end" anchor at the beginning of the left-to-right pattern. This is rare.</summary>
+ LeadingAnchor_LeftToRight_End,
+
+ /// <summary>A "beginning" anchor at the beginning of the right-to-left pattern.</summary>
+ LeadingAnchor_RightToLeft_Beginning,
+ /// <summary>A "start" anchor at the beginning of the right-to-left pattern.</summary>
+ LeadingAnchor_RightToLeft_Start,
+ /// <summary>An "endz" anchor at the beginning of the right-to-left pattern. This is rare.</summary>
+ LeadingAnchor_RightToLeft_EndZ,
+ /// <summary>An "end" anchor at the beginning of the right-to-left pattern. This is rare.</summary>
+ LeadingAnchor_RightToLeft_End,
+
+ /// <summary>A case-sensitive multi-character substring at the beginning of the left-to-right pattern.</summary>
+ LeadingPrefix_LeftToRight_CaseSensitive,
+ /// <summary>A case-sensitive multi-character substring at the beginning of the right-to-left pattern.</summary>
+ LeadingPrefix_RightToLeft_CaseSensitive,
+
+ /// <summary>A case-sensitive set starting the left-to-right pattern.</summary>
+ LeadingSet_LeftToRight_CaseSensitive,
+ /// <summary>A case-insensitive set starting the left-to-right pattern.</summary>
+ LeadingSet_LeftToRight_CaseInsensitive,
+ /// <summary>A case-sensitive set starting the right-to-left pattern.</summary>
+ LeadingSet_RightToLeft_CaseSensitive,
+ /// <summary>A case-insensitive set starting the right-to-left pattern.</summary>
+ LeadingSet_RightToLeft_CaseInsensitive,
+
+ /// <summary>A case-sensitive single character at a fixed distance from the start of the right-to-left pattern.</summary>
+ LeadingLiteral_RightToLeft_CaseSensitive,
+ /// <summary>A case-insensitive single character at a fixed distance from the start of the right-to-left pattern.</summary>
+ LeadingLiteral_RightToLeft_CaseInsensitive,
+
+ /// <summary>A case-sensitive single character at a fixed distance from the start of the left-to-right pattern.</summary>
+ FixedLiteral_LeftToRight_CaseSensitive,
+ /// <summary>A case-insensitive single character at a fixed distance from the start of the left-to-right pattern.</summary>
+ FixedLiteral_LeftToRight_CaseInsensitive,
+
+ /// <summary>One or more sets, each at a fixed distance from the start of the left-to-right pattern. The first set is case-sensitive; any further sets may be either.</summary>
+ FixedSets_LeftToRight_CaseSensitive,
+ /// <summary>One or more sets, each at a fixed distance from the start of the left-to-right pattern. The first set is case-insensitive (the input character is lowercased before it is tested against the set); any further sets may be either.</summary>
+ FixedSets_LeftToRight_CaseInsensitive,
+
+ /// <summary>Nothing to search for: the current position is accepted as a candidate without being moved.</summary>
+ NoSearch,
+ }
+}
private readonly RegexCode _code;
private readonly TextInfo _textInfo;
- private readonly FindFirstCharMode _findFirstCharMode;
private int _operator;
private int _codepos;
_code = code;
_textInfo = culture.TextInfo;
-
- // Determine what searching mode FindFirstChar will employ.
- if ((_code.LeadingAnchor & (RegexPrefixAnalyzer.Beginning | RegexPrefixAnalyzer.Start | RegexPrefixAnalyzer.EndZ | RegexPrefixAnalyzer.End)) != 0)
- {
- _findFirstCharMode = (_code.LeadingAnchor, code.RightToLeft) switch
- {
- (RegexPrefixAnalyzer.Beginning, false) => FindFirstCharMode.LeadingAnchor_LeftToRight_Beginning,
- (RegexPrefixAnalyzer.Beginning, true) => FindFirstCharMode.LeadingAnchor_RightToLeft_Beginning,
- (RegexPrefixAnalyzer.Start, false) => FindFirstCharMode.LeadingAnchor_LeftToRight_Start,
- (RegexPrefixAnalyzer.Start, true) => FindFirstCharMode.LeadingAnchor_RightToLeft_Start,
- (RegexPrefixAnalyzer.End, false) => FindFirstCharMode.LeadingAnchor_LeftToRight_End,
- (RegexPrefixAnalyzer.End, true) => FindFirstCharMode.LeadingAnchor_RightToLeft_End,
- (_, false) => FindFirstCharMode.LeadingAnchor_LeftToRight_EndZ,
- (_, true) => FindFirstCharMode.LeadingAnchor_RightToLeft_EndZ,
- };
- }
- else if (code.BoyerMoorePrefix is RegexBoyerMoore rbm)
- {
- _findFirstCharMode = rbm.PatternSupportsIndexOf ?
- FindFirstCharMode.IndexOf :
- FindFirstCharMode.BoyerMoore;
- }
- else if (code.LeadingCharClasses is not null)
- {
- (string charClass, bool caseInsensitive) = code.LeadingCharClasses[0];
- bool isSet = !RegexCharClass.IsSingleton(charClass);
- _findFirstCharMode = (code.RightToLeft, caseInsensitive, isSet) switch
- {
- (false, false, false) => FindFirstCharMode.LeadingCharClass_LeftToRight_CaseSensitive_Singleton,
- (false, false, true) => FindFirstCharMode.LeadingCharClass_LeftToRight_CaseSensitive_Set,
- (false, true, false) => FindFirstCharMode.LeadingCharClass_LeftToRight_CaseInsensitive_Singleton,
- (false, true, true) => FindFirstCharMode.LeadingCharClass_LeftToRight_CaseInsensitive_Set,
- (true, false, false) => FindFirstCharMode.LeadingCharClass_RightToLeft_CaseSensitive_Singleton,
- (true, false, true) => FindFirstCharMode.LeadingCharClass_RightToLeft_CaseSensitive_Set,
- (true, true, false) => FindFirstCharMode.LeadingCharClass_RightToLeft_CaseInsensitive_Singleton,
- (true, true, true) => FindFirstCharMode.LeadingCharClass_RightToLeft_CaseInsensitive_Set,
- };
- }
- else
- {
- _findFirstCharMode = FindFirstCharMode.NoSearch;
- }
}
protected override void InitTrackCount() => runtrackcount = _code.TrackCount;
private void Backwardnext() => runtextpos += _rightToLeft ? 1 : -1;
- private enum FindFirstCharMode
- {
- LeadingAnchor_LeftToRight_Beginning,
- LeadingAnchor_LeftToRight_Start,
- LeadingAnchor_LeftToRight_EndZ,
- LeadingAnchor_LeftToRight_End,
-
- LeadingAnchor_RightToLeft_Beginning,
- LeadingAnchor_RightToLeft_Start,
- LeadingAnchor_RightToLeft_EndZ,
- LeadingAnchor_RightToLeft_End,
-
- IndexOf,
- BoyerMoore,
-
- LeadingCharClass_LeftToRight_CaseSensitive_Singleton,
- LeadingCharClass_LeftToRight_CaseSensitive_Set,
- LeadingCharClass_LeftToRight_CaseInsensitive_Singleton,
- LeadingCharClass_LeftToRight_CaseInsensitive_Set,
-
- LeadingCharClass_RightToLeft_CaseSensitive_Singleton,
- LeadingCharClass_RightToLeft_CaseSensitive_Set,
- LeadingCharClass_RightToLeft_CaseInsensitive_Singleton,
- LeadingCharClass_RightToLeft_CaseInsensitive_Set,
-
- NoSearch,
- }
-
- protected override bool FindFirstChar()
- {
- // Return early if we know there's not enough input left to match.
- if (!_code.RightToLeft)
- {
- if (runtextpos > runtextend - _code.Tree.MinRequiredLength)
- {
- runtextpos = runtextend;
- return false;
- }
- }
- else
- {
- if (runtextpos - _code.Tree.MinRequiredLength < runtextbeg)
- {
- runtextpos = runtextbeg;
- return false;
- }
- }
-
- // Optimize the handling of a Beginning-Of-Line (BOL) anchor. BOL is special, in that unlike
- // other anchors like Beginning, there are potentially multiple places a BOL can match. So unlike
- // the other anchors, which all skip all subsequent processing if found, with BOL we just use it
- // to boost our position to the next line, and then continue normally with any Boyer-Moore or
- // leading char class searches.
- if (_code.LeadingAnchor == RegexPrefixAnalyzer.Bol &&
- !_code.RightToLeft) // don't bother customizing this optimization for the very niche RTL + Multiline case
- {
- // If we're not currently positioned at the beginning of a line (either
- // the beginning of the string or just after a line feed), find the next
- // newline and position just after it.
- if (runtextpos > runtextbeg && runtext![runtextpos - 1] != '\n')
- {
- int newline = runtext.IndexOf('\n', runtextpos);
- if (newline == -1 || newline + 1 > runtextend)
- {
- runtextpos = runtextend;
- return false;
- }
-
- runtextpos = newline + 1;
- }
- }
-
- switch (_findFirstCharMode)
- {
- // If the pattern is anchored, we can update our position appropriately and return immediately.
- // If there's a Boyer-Moore prefix, we can also validate it.
-
- case FindFirstCharMode.LeadingAnchor_LeftToRight_Beginning:
- if (runtextpos > runtextbeg)
- {
- runtextpos = runtextend;
- return false;
- }
- return NoPrefixOrPrefixMatches();
-
- case FindFirstCharMode.LeadingAnchor_LeftToRight_Start:
- if (runtextpos > runtextstart)
- {
- runtextpos = runtextend;
- return false;
- }
- return NoPrefixOrPrefixMatches();
-
- case FindFirstCharMode.LeadingAnchor_LeftToRight_EndZ:
- if (runtextpos < runtextend - 1)
- {
- runtextpos = runtextend - 1;
- }
- return NoPrefixOrPrefixMatches();
-
- case FindFirstCharMode.LeadingAnchor_LeftToRight_End:
- if (runtextpos < runtextend)
- {
- runtextpos = runtextend;
- }
- return NoPrefixOrPrefixMatches();
-
- case FindFirstCharMode.LeadingAnchor_RightToLeft_Beginning:
- if (runtextpos > runtextbeg)
- {
- runtextpos = runtextbeg;
- }
- return NoPrefixOrPrefixMatches();
-
- case FindFirstCharMode.LeadingAnchor_RightToLeft_Start:
- if (runtextpos < runtextstart)
- {
- runtextpos = runtextbeg;
- return false;
- }
- return NoPrefixOrPrefixMatches();
-
- case FindFirstCharMode.LeadingAnchor_RightToLeft_EndZ:
- if (runtextpos < runtextend - 1 || (runtextpos == runtextend - 1 && runtext![runtextpos] != '\n'))
- {
- runtextpos = runtextbeg;
- return false;
- }
- return NoPrefixOrPrefixMatches();
-
- case FindFirstCharMode.LeadingAnchor_RightToLeft_End:
- if (runtextpos < runtextend)
- {
- runtextpos = runtextbeg;
- return false;
- }
- return NoPrefixOrPrefixMatches();
-
- // There was a prefix. Scan for it.
-
- case FindFirstCharMode.IndexOf:
- {
- int i = runtext.AsSpan(runtextpos, runtextend - runtextpos).IndexOf(_code.BoyerMoorePrefix!.Pattern);
- if (i >= 0)
- {
- runtextpos += i;
- return true;
- }
- runtextpos = runtextend;
- return false;
- }
-
- case FindFirstCharMode.BoyerMoore:
- runtextpos = _code.BoyerMoorePrefix!.Scan(runtext!, runtextpos, runtextbeg, runtextend);
- if (runtextpos >= 0)
- {
- return true;
- }
- runtextpos = _code.RightToLeft ? runtextbeg : runtextend;
- return false;
-
- // There's a leading character class. Search for it.
-
- case FindFirstCharMode.LeadingCharClass_LeftToRight_CaseSensitive_Singleton:
- {
- ReadOnlySpan<char> span = runtext.AsSpan(runtextpos, runtextend - runtextpos);
- int i = span.IndexOf(RegexCharClass.SingletonChar(_code.LeadingCharClasses![0].CharClass));
- if (i >= 0)
- {
- runtextpos += i;
- return true;
- }
- runtextpos = runtextend;
- return false;
- }
-
- case FindFirstCharMode.LeadingCharClass_LeftToRight_CaseSensitive_Set:
- {
- string set = _code.LeadingCharClasses![0].CharClass;
- ReadOnlySpan<char> span = runtext.AsSpan(runtextpos, runtextend - runtextpos);
- for (int i = 0; i < span.Length; i++)
- {
- if (RegexCharClass.CharInClass(span[i], set, ref _code.LeadingCharClassAsciiLookup))
- {
- runtextpos += i;
- return true;
- }
- }
- runtextpos = runtextend;
- return false;
- }
-
- case FindFirstCharMode.LeadingCharClass_LeftToRight_CaseInsensitive_Singleton:
- {
- char ch = RegexCharClass.SingletonChar(_code.LeadingCharClasses![0].CharClass);
- TextInfo ti = _textInfo;
- ReadOnlySpan<char> span = runtext.AsSpan(runtextpos, runtextend - runtextpos);
- for (int i = 0; i < span.Length; i++)
- {
- if (ch == ti.ToLower(span[i]))
- {
- runtextpos += i;
- return true;
- }
- }
- runtextpos = runtextend;
- return false;
- }
-
- case FindFirstCharMode.LeadingCharClass_LeftToRight_CaseInsensitive_Set:
- {
- string set = _code.LeadingCharClasses![0].CharClass;
- ReadOnlySpan<char> span = runtext.AsSpan(runtextpos, runtextend - runtextpos);
- TextInfo ti = _textInfo;
- for (int i = 0; i < span.Length; i++)
- {
- if (RegexCharClass.CharInClass(ti.ToLower(span[i]), set, ref _code.LeadingCharClassAsciiLookup))
- {
- runtextpos += i;
- return true;
- }
- }
- runtextpos = runtextend;
- return false;
- }
-
- case FindFirstCharMode.LeadingCharClass_RightToLeft_CaseSensitive_Singleton:
- {
- ReadOnlySpan<char> span = runtext.AsSpan(runtextbeg, runtextpos - runtextbeg);
- int i = span.LastIndexOf(RegexCharClass.SingletonChar(_code.LeadingCharClasses![0].CharClass));
- if (i >= 0)
- {
- runtextpos = runtextbeg + i + 1;
- return true;
- }
- runtextpos = runtextbeg;
- return false;
- }
-
- case FindFirstCharMode.LeadingCharClass_RightToLeft_CaseSensitive_Set:
- {
- string set = _code.LeadingCharClasses![0].CharClass;
- ReadOnlySpan<char> span = runtext.AsSpan(runtextbeg, runtextpos - runtextbeg);
- for (int i = span.Length - 1; i >= 0; i--)
- {
- if (RegexCharClass.CharInClass(span[i], set, ref _code.LeadingCharClassAsciiLookup))
- {
- runtextpos = runtextbeg + i + 1;
- return true;
- }
- }
- runtextpos = runtextbeg;
- return false;
- }
-
- case FindFirstCharMode.LeadingCharClass_RightToLeft_CaseInsensitive_Singleton:
- {
- char ch = RegexCharClass.SingletonChar(_code.LeadingCharClasses![0].CharClass);
- TextInfo ti = _textInfo;
- ReadOnlySpan<char> span = runtext.AsSpan(runtextbeg, runtextpos - runtextbeg);
- for (int i = span.Length - 1; i >= 0; i--)
- {
- if (ch == ti.ToLower(span[i]))
- {
- runtextpos = runtextbeg + i + 1;
- return true;
- }
- }
- runtextpos = runtextbeg;
- return false;
- }
-
- case FindFirstCharMode.LeadingCharClass_RightToLeft_CaseInsensitive_Set:
- {
- string set = _code.LeadingCharClasses![0].CharClass;
- ReadOnlySpan<char> span = runtext.AsSpan(runtextbeg, runtextpos - runtextbeg);
- TextInfo ti = _textInfo;
- for (int i = span.Length - 1; i >= 0; i--)
- {
- if (RegexCharClass.CharInClass(ti.ToLower(span[i]), set, ref _code.LeadingCharClassAsciiLookup))
- {
- runtextpos = runtextbeg + i + 1;
- return true;
- }
- }
- runtextpos = runtextbeg;
- return false;
- }
-
- // Nothing special to look for. Just return true indicating this is a valid position to try to match.
-
- default:
- Debug.Assert(_findFirstCharMode == FindFirstCharMode.NoSearch);
- return true;
- }
-
- bool NoPrefixOrPrefixMatches() =>
- _code.BoyerMoorePrefix is not RegexBoyerMoore rbm ||
- rbm.IsMatch(runtext!, runtextpos, runtextbeg, runtextend);
- }
+ /// <summary>
+ /// Finds the next position at which a match attempt should begin by delegating to the
+ /// find optimizations stored on <c>_code</c>.
+ /// </summary>
+ /// <returns>The value returned by <c>TryFindNextStartingPosition</c>.</returns>
+ protected override bool FindFirstChar()
+ {
+     // runtextpos is handed over by reference so the search can reposition it.
+     return _code.FindOptimizations.TryFindNextStartingPosition(runtext!, ref runtextpos, runtextbeg, runtextstart, runtextend);
+ }
protected override void Go()
{
int operand0 = Operand(0);
string set = _code.Strings[operand0];
- ref int[]? setLookup = ref _code.StringsAsciiLookup[operand0];
+ ref uint[]? setLookup = ref _code.StringsAsciiLookup[operand0];
while (c-- > 0)
{
int len = Math.Min(Operand(1), Forwardchars());
int operand0 = Operand(0);
string set = _code.Strings[operand0];
- ref int[]? setLookup = ref _code.StringsAsciiLookup[operand0];
+ ref uint[]? setLookup = ref _code.StringsAsciiLookup[operand0];
int i;
for (i = len; i > 0; i--)
_code = code;
_codes = code.Codes;
_strings = code.Strings;
- _leadingCharClasses = code.LeadingCharClasses;
- _boyerMoorePrefix = code.BoyerMoorePrefix;
- _leadingAnchor = code.LeadingAnchor;
_trackcount = code.TrackCount;
_options = options;
_hasTimeout = hasTimeout;
using System.Collections.Generic;
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
+using System.Globalization;
using System.Threading;
namespace System.Text.RegularExpressions
N = n;
}
- public bool UseOptionR() => (Options & RegexOptions.RightToLeft) != 0;
+ /// <summary>Builds the node that matches a single character, folding case-insensitivity into the node itself whenever possible.</summary>
+ /// <param name="ch">The character to match.</param>
+ /// <param name="options">The options to apply to the node.</param>
+ /// <param name="culture">The culture used for case conversions. Must be non-null when <paramref name="options"/> includes <see cref="RegexOptions.IgnoreCase"/>.</param>
+ /// <returns>
+ /// A RegexNode.One or a RegexNode.Set. <see cref="RegexOptions.IgnoreCase"/> is removed from the node's
+ /// options whenever the node could be made case-sensitive.
+ /// </returns>
+ public static RegexNode CreateOneWithCaseConversion(char ch, RegexOptions options, CultureInfo? culture)
+ {
+     // Without IgnoreCase there is nothing to convert: the character is used exactly as given.
+     if ((options & RegexOptions.IgnoreCase) == 0)
+     {
+         return new RegexNode(One, options, ch);
+     }
+
+     Debug.Assert(culture is not null);
+     RegexOptions caseSensitiveOptions = options & ~RegexOptions.IgnoreCase;
+
+     // A character that doesn't participate in case conversion matches only itself,
+     // so the node can simply be made case-sensitive.
+     if (!RegexCharClass.ParticipatesInCaseConversion(ch))
+     {
+         return new RegexNode(One, caseSensitiveOptions, ch);
+     }
+
+     // Try to express the character and all of its case-insensitive equivalents as a set.
+     // The out value reports whether the resulting set would still need case-insensitive matching.
+     string equivalenceSet = RegexCharClass.OneToStringClass(ch, culture, out bool stillCaseInsensitive);
+     if (stillCaseInsensitive)
+     {
+         // Until ToLower usage at match time can be removed entirely (https://github.com/dotnet/runtime/issues/61048),
+         // fall back to an IgnoreCase One node holding the lowercased character.
+         return new RegexNode(One, options, culture.TextInfo.ToLower(ch));
+     }
+
+     // The set fully captures the casing, so IgnoreCase is no longer needed on the node.
+     return new RegexNode(Set, caseSensitiveOptions, equivalenceSet);
+ }
- public RegexNode ReverseLeft()
+ /// <summary>Reverses all children of a concatenation when in RightToLeft mode.</summary>
+ public RegexNode ReverseConcatenationIfRightToLeft()
{
- if (UseOptionR() && Type == Concatenate && ChildCount() > 1)
+ if ((Options & RegexOptions.RightToLeft) != 0 &&
+ Type == Concatenate &&
+ ChildCount() > 1)
{
((List<RegexNode>)Children!).Reverse();
}
{
RegexNode node = toExamine.Pop();
+ // Add all children to be examined
+ int childCount = node.ChildCount();
+ for (int i = 0; i < childCount; i++)
+ {
+ RegexNode child = node.Child(i);
+ Debug.Assert(child.Next == node, $"{child.Description()} missing reference to parent {node.Description()}");
+
+ toExamine.Push(child);
+ }
+
// Validate that we never see certain node types.
Debug.Assert(Type != Group, "All Group nodes should have been removed.");
- // Validate expected child counts.
- int childCount = node.ChildCount();
+ // Validate node types and expected child counts.
switch (node.Type)
{
+ case Group:
+ Debug.Fail("All Group nodes should have been removed.");
+ break;
+
case Beginning:
case Bol:
case Boundary:
case Prevent:
case Require:
Debug.Assert(childCount == 1, $"Expected one and only one child for {node.TypeName}, got {childCount}.");
- toExamine.Push(node.Child(0));
break;
case Testref:
case Testgroup:
Debug.Assert(childCount >= 1, $"Expected at least one child for {node.TypeName}, got {childCount}.");
- for (int i = 0; i < childCount; i++)
- {
- toExamine.Push(node.Child(i));
- }
break;
case Concatenate:
case Alternate:
Debug.Assert(childCount >= 2, $"Expected at least two children for {node.TypeName}, got {childCount}.");
- for (int i = 0; i < childCount; i++)
- {
- toExamine.Push(node.Child(i));
- }
+ break;
+
+ default:
+ Debug.Fail($"Unexpected node type: {node.Type}");
break;
}
switch (node.Type)
{
case Multi:
+ Debug.Assert(node.Str is not null, "Expect non-null multi string");
+ Debug.Assert(node.Str.Length >= 2, $"Expected {node.Str} to be at least two characters");
+ break;
+
case Set:
case Setloop:
case Setloopatomic:
default:
ReduceSingleLetterAndNestedAlternations();
- RegexNode newThis = ReplaceNodeIfUnnecessary(Nothing);
- return newThis != this ? newThis : ExtractCommonPrefixes();
+ RegexNode node = ReplaceNodeIfUnnecessary(Nothing);
+ node = ExtractCommonPrefixText(node);
+ node = ExtractCommonPrefixOneNotoneSet(node);
+ return node;
}
// This function performs two optimizations:
break;
}
-
// The last node was a Set or a One, we're a Set or One and our options are the same.
// Merge the two nodes.
j--;
prev.Type = Set;
prev.Str = prevCharClass.ToStringClass(Options);
+ if ((prev.Options & RegexOptions.IgnoreCase) != 0 &&
+ RegexCharClass.MakeCaseSensitiveIfPossible(prev.Str, RegexParser.GetTargetCulture(prev.Options)) is string newSetString)
+ {
+ prev.Str = newSetString;
+ prev.Options &= ~RegexOptions.IgnoreCase;
+ }
}
else if (at.Type == Nothing)
{
}
}
+ // Factors a shared leading node out of runs of adjacent alternation branches.
+ // When two or more contiguous branches begin with an identical One, Notone, or Set node
+ // (individual, atomic loop, or fixed-count loop), that node is hoisted in front of a new
+ // nested alternation that holds what remains of those branches.
+ // e.g. \w12|\d34|\d56|\w78|\w90 => \w12|\d(?:34|56)|\w(?:78|90)
+ static RegexNode ExtractCommonPrefixOneNotoneSet(RegexNode alternation)
+ {
+     // Anything other than an alternation is handed back untouched.
+     if (alternation.Type != Alternate)
+     {
+         return alternation;
+     }
+
+     Debug.Assert(alternation.Children is List<RegexNode> { Count: >= 2 });
+     var branches = (List<RegexNode>)alternation.Children;
+
+     // Right-to-left alternations are not processed.
+     if ((alternation.Options & RegexOptions.RightToLeft) != 0)
+     {
+         return alternation;
+     }
+
+     // Every branch must be a concatenation with at least two children; otherwise give up entirely.
+     for (int b = 0; b < branches.Count; b++)
+     {
+         RegexNode branch = branches[b];
+         if (branch.Type != Concatenate || branch.ChildCount() < 2)
+         {
+             return alternation;
+         }
+     }
+
+     for (int first = 0; first < branches.Count - 1; first++)
+     {
+         Debug.Assert(branches[first].Children is List<RegexNode> { Count: >= 2 });
+
+         // The candidate prefix is the first node of this branch. Individual characters/sets, atomic
+         // loops, and fixed-count loops qualify. Non-atomic variable-length loops are excluded because
+         // hoisting them could change behavior: each branch could otherwise have the loop consume a
+         // different number of characters depending on what follows it.
+         RegexNode prefix = branches[first].Child(0);
+         bool hoistable = prefix.Type switch
+         {
+             One or Notone or Set => true,
+             Oneloopatomic or Notoneloopatomic or Setloopatomic => true,
+             Oneloop or Notoneloop or Setloop or Onelazy or Notonelazy or Setlazy => prefix.M == prefix.N,
+             _ => false,
+         };
+         if (!hoistable)
+         {
+             continue;
+         }
+
+         // Advance past every following branch whose first node is exactly the same as the prefix.
+         int pastLast = first + 1;
+         while (pastLast < branches.Count)
+         {
+             RegexNode candidate = branches[pastLast].Child(0);
+             bool samePrefix =
+                 prefix.Type == candidate.Type &&
+                 prefix.Options == candidate.Options &&
+                 prefix.M == candidate.M &&
+                 prefix.N == candidate.N &&
+                 prefix.Ch == candidate.Ch &&
+                 prefix.Str == candidate.Str;
+             if (!samePrefix)
+             {
+                 break;
+             }
+
+             pastLast++;
+         }
+
+         int runLength = pastLast - first;
+         if (runLength <= 1)
+         {
+             // Only this branch starts with the prefix, so there is nothing to share.
+             continue;
+         }
+
+         // Strip the prefix from each branch in the run and collect the remainders in a new alternation.
+         var remainders = new RegexNode(Alternate, alternation.Options);
+         for (int b = first; b < pastLast; b++)
+         {
+             RegexNode branch = branches[b];
+             ((List<RegexNode>)branch.Children!).RemoveAt(0);
+             remainders.AddChild(branch);
+         }
+
+         // When the original alternation sits directly under an Atomic node, wrap the new one the same way.
+         if (alternation.Next is RegexNode { Type: Atomic })
+         {
+             var atomicWrapper = new RegexNode(Atomic, alternation.Options);
+             atomicWrapper.AddChild(remainders);
+             remainders = atomicWrapper;
+         }
+
+         // Replace the whole run with a single branch: the prefix followed by the new alternation.
+         var hoisted = new RegexNode(Concatenate, alternation.Options);
+         hoisted.AddChild(prefix);
+         hoisted.AddChild(remainders);
+         alternation.ReplaceChild(first, hoisted);
+         branches.RemoveRange(first + 1, runLength - 1);
+     }
+
+     // An alternation left with a single branch collapses to that branch.
+     if (alternation.ChildCount() == 1)
+     {
+         return alternation.Child(0);
+     }
+
+     return alternation;
+ }
+
// Analyzes all the branches of the alternation for text that's identical at the beginning
// of every branch. That text is then pulled out into its own one or multi node in a
// concatenation with the alternation (whose branches are updated to remove that prefix).
// by sets that can be merged. Third, it reduces the amount of duplicated comparisons required
// if we end up backtracking into subsequent branches.
// e.g. abc|ade => a(?bc|de)
- RegexNode ExtractCommonPrefixes()
+ static RegexNode ExtractCommonPrefixText(RegexNode alternation)
{
+ if (alternation.Type != Alternate)
+ {
+ return alternation;
+ }
+
+ Debug.Assert(alternation.Children is List<RegexNode> { Count: >= 2 });
+ var children = (List<RegexNode>)alternation.Children;
+
// To keep things relatively simple, we currently only handle:
// - Left to right (e.g. we don't process alternations in lookbehinds)
// - Branches that are one or multi nodes, or that are concatenations beginning with one or multi nodes.
// - All branches having the same options.
- // - Text, rather than also trying to combine identical sets that start each branch.
-
- Debug.Assert(Children is List<RegexNode>);
- var children = (List<RegexNode>)Children;
- Debug.Assert(children.Count >= 2);
// Only extract left-to-right prefixes.
- if ((Options & RegexOptions.RightToLeft) != 0)
+ if ((alternation.Options & RegexOptions.RightToLeft) != 0)
{
- return this;
+ return alternation;
}
Span<char> scratchChar = stackalloc char[1];
RegexNode? startingNode = children[startingIndex].FindBranchOneOrMultiStart();
if (startingNode is null)
{
- return this;
+ return alternation;
}
RegexOptions startingNodeOptions = startingNode.Options;
}
}
- if (Next is RegexNode parent && parent.Type == Atomic)
+ if (alternation.Next is RegexNode parent && parent.Type == Atomic)
{
var atomic = new RegexNode(Atomic, startingNodeOptions);
atomic.AddChild(newAlternate);
var newConcat = new RegexNode(Concatenate, startingNodeOptions);
newConcat.AddChild(prefix);
newConcat.AddChild(newAlternate);
- ReplaceChild(startingIndex, newConcat);
+ alternation.ReplaceChild(startingIndex, newConcat);
children.RemoveRange(startingIndex + 1, endingIndex - startingIndex - 1);
}
- return ChildCount() == 1 ? Child(0) : this;
+ return alternation.ChildCount() == 1 ? alternation.Child(0) : alternation;
}
}
{
}
+ /// <summary>Selects the culture implied by the specified options.</summary>
+ /// <param name="options">The options to inspect.</param>
+ /// <returns>
+ /// <see cref="CultureInfo.InvariantCulture"/> when <see cref="RegexOptions.CultureInvariant"/> is set;
+ /// otherwise <see cref="CultureInfo.CurrentCulture"/>.
+ /// </returns>
+ internal static CultureInfo GetTargetCulture(RegexOptions options)
+ {
+     if ((options & RegexOptions.CultureInvariant) != 0)
+     {
+         return CultureInfo.InvariantCulture;
+     }
+
+     return CultureInfo.CurrentCulture;
+ }
+
public static RegexTree Parse(string pattern, RegexOptions options, CultureInfo culture)
{
var parser = new RegexParser(pattern, options, culture, stackalloc int[OptionStackDefaultSize]);
goto ContinueOuterScan;
case '[':
- AddUnitSet(ScanCharClass(UseOptionI(), scanOnly: false)!.ToStringClass(_options));
+ {
+ string setString = ScanCharClass(UseOptionI(), scanOnly: false)!.ToStringClass(_options);
+ _unit = UseOptionI() && RegexCharClass.MakeCaseSensitiveIfPossible(setString, _culture) is string newSetString ?
+ new RegexNode(RegexNode.Set, _options & ~RegexOptions.IgnoreCase, newSetString) :
+ new RegexNode(RegexNode.Set, _options, setString);
+ }
break;
case '(':
break;
case '.':
- if (UseOptionS())
- {
- AddUnitSet(RegexCharClass.AnyClass);
- }
- else
- {
- AddUnitNotone('\n');
- }
+ _unit = UseOptionS() ?
+ new RegexNode(RegexNode.Set, _options & ~RegexOptions.IgnoreCase, RegexCharClass.AnyClass) :
+ new RegexNode(RegexNode.Notone, _options & ~RegexOptions.IgnoreCase, '\n');
break;
case '{':
{
// we aren't in a range, and now there is a subtraction. Usually this happens
// only when a subtraction follows a range, like [a-z-[b]]
+ MoveRight();
+ RegexCharClass? rcc = ScanCharClass(caseInsensitive, scanOnly);
if (!scanOnly)
{
- MoveRight(1);
- charClass!.AddSubtraction(ScanCharClass(caseInsensitive, scanOnly)!);
+ charClass!.AddSubtraction(rcc!);
if (CharsRight() > 0 && RightChar() != ']')
{
throw MakeException(RegexParseError.ExclusionGroupNotLast, SR.ExclusionGroupNotLast);
}
}
- else
- {
- MoveRight(1);
- ScanCharClass(caseInsensitive, scanOnly);
- }
}
else
{
case 'w':
MoveRight();
return scanOnly ? null :
- new RegexNode(RegexNode.Set, _options, UseOptionE() ? RegexCharClass.ECMAWordClass : RegexCharClass.WordClass);
+ new RegexNode(RegexNode.Set, RemoveIgnoreCaseIfNotEcma(_options), UseOptionE() ? RegexCharClass.ECMAWordClass : RegexCharClass.WordClass);
case 'W':
MoveRight();
return scanOnly ? null :
- new RegexNode(RegexNode.Set, _options, UseOptionE() ? RegexCharClass.NotECMAWordClass : RegexCharClass.NotWordClass);
+ new RegexNode(RegexNode.Set, RemoveIgnoreCaseIfNotEcma(_options), UseOptionE() ? RegexCharClass.NotECMAWordClass : RegexCharClass.NotWordClass);
case 's':
MoveRight();
return scanOnly ? null :
- new RegexNode(RegexNode.Set, _options, UseOptionE() ? RegexCharClass.ECMASpaceClass : RegexCharClass.SpaceClass);
+ new RegexNode(RegexNode.Set, RemoveIgnoreCaseIfNotEcma(_options), UseOptionE() ? RegexCharClass.ECMASpaceClass : RegexCharClass.SpaceClass);
case 'S':
MoveRight();
return scanOnly ? null :
- new RegexNode(RegexNode.Set, _options, UseOptionE() ? RegexCharClass.NotECMASpaceClass : RegexCharClass.NotSpaceClass);
+ new RegexNode(RegexNode.Set, RemoveIgnoreCaseIfNotEcma(_options), UseOptionE() ? RegexCharClass.NotECMASpaceClass : RegexCharClass.NotSpaceClass);
case 'd':
MoveRight();
return scanOnly ? null :
- new RegexNode(RegexNode.Set, _options, UseOptionE() ? RegexCharClass.ECMADigitClass : RegexCharClass.DigitClass);
+ new RegexNode(RegexNode.Set, RemoveIgnoreCaseIfNotEcma(_options), UseOptionE() ? RegexCharClass.ECMADigitClass : RegexCharClass.DigitClass);
case 'D':
MoveRight();
return scanOnly ? null :
- new RegexNode(RegexNode.Set, _options, UseOptionE() ? RegexCharClass.NotECMADigitClass : RegexCharClass.NotDigitClass);
+ new RegexNode(RegexNode.Set, RemoveIgnoreCaseIfNotEcma(_options), UseOptionE() ? RegexCharClass.NotECMADigitClass : RegexCharClass.NotDigitClass);
case 'p':
case 'P':
default:
return ScanBasicBackslash(scanOnly);
}
+
+ // Used for \w, \W, \d, \D, \s, and \S. These classes already account for casing in their
+ // definitions, so IgnoreCase is stripped from the options. For compatibility the flag is
+ // left in place when ECMAScript is specified. That should be revisited as part of
+ // https://github.com/dotnet/runtime/issues/61048, as it seems wrong that specifying
+ // ECMAScript (which implies non-Unicode) would still involve lowercasing potentially
+ // Unicode character inputs to match against these sets.
+ static RegexOptions RemoveIgnoreCaseIfNotEcma(RegexOptions options) =>
+     (options & RegexOptions.ECMAScript) != 0 ?
+         options :
+         options & ~RegexOptions.IgnoreCase;
}
/// <summary>Scans \-style backreferences and character escapes</summary>
Textto(backpos);
ch = ScanCharEscape();
- if (UseOptionI())
- {
- ch = _culture.TextInfo.ToLower(ch);
- }
-
- return scanOnly ? null : new RegexNode(RegexNode.One, _options, ch);
+ return !scanOnly ?
+ RegexNode.CreateOneWithCaseConversion(ch, _options, _culture) :
+ null;
}
/*
{
if (CharsRight() == 0)
{
- return new RegexNode(RegexNode.One, _options, '$');
+ return RegexNode.CreateOneWithCaseConversion('$', _options, _culture);
}
char ch = RightChar();
{
case '$':
MoveRight();
- return new RegexNode(RegexNode.One, _options, '$');
+ return RegexNode.CreateOneWithCaseConversion('$', _options, _culture);
case '&':
capnum = 0;
// unrecognized $: literalize
Textto(backpos);
- return new RegexNode(RegexNode.One, _options, '$');
+ return RegexNode.CreateOneWithCaseConversion('$', _options, _culture);
}
/// <summary>Throws on unsupported capture references for NonBacktracking in replacement patterns.</summary>
/// <summary>Add a string to the last concatenate.</summary>
private void AddConcatenate(int pos, int cch, bool isReplacement)
{
- if (cch == 0)
+ switch (cch)
{
- return;
- }
+ case 0:
+ return;
- RegexNode node;
- if (cch > 1)
- {
- string str = UseOptionI() && !isReplacement ?
-#if REGEXGENERATOR
- StringExtensions.Create
-#else
- string.Create
-#endif
- (cch, (_pattern, _culture, pos, cch), static (dest, state) =>
- {
- // We do the ToLower character-by character for consistency with the rest of the implementation.
- // With surrogate pairs, doing a ToLower on the entire string is more correct linguistically, but
- // Regex doesn't support surrogates, and not doing this character-by-character then causes differences
- // from matching where characters are lowercased individually.
- ReadOnlySpan<char> src = state._pattern.AsSpan(state.pos, state.cch);
- TextInfo ti = state._culture.TextInfo;
- for (int i = 0; i < dest.Length; i++)
- {
- dest[i] = ti.ToLower(src[i]);
- }
- }) :
- _pattern.Substring(pos, cch);
-
- node = new RegexNode(RegexNode.Multi, _options, str);
- }
- else
- {
- char ch = _pattern[pos];
+ case 1:
+ _concatenation!.AddChild(RegexNode.CreateOneWithCaseConversion(_pattern[pos], isReplacement ? _options & ~RegexOptions.IgnoreCase : _options, _culture));
+ break;
- if (UseOptionI() && !isReplacement)
- {
- ch = _culture.TextInfo.ToLower(ch);
- }
+ case > 1 when !UseOptionI() || isReplacement:
+ _concatenation!.AddChild(new RegexNode(RegexNode.Multi, _options & ~RegexOptions.IgnoreCase, _pattern.Substring(pos, cch)));
+ break;
- node = new RegexNode(RegexNode.One, _options, ch);
+ default:
+ foreach (char c in _pattern.AsSpan(pos, cch))
+ {
+ _concatenation!.AddChild(RegexNode.CreateOneWithCaseConversion(c, _options, _culture));
+ }
+ break;
}
-
- _concatenation!.AddChild(node);
}
/// <summary>Push the parser state (in response to an open paren)</summary>
if (_group!.Type == RegexNode.Testgroup || _group.Type == RegexNode.Testref)
{
- _group.AddChild(_concatenation!.ReverseLeft());
+ _group.AddChild(_concatenation!.ReverseConcatenationIfRightToLeft());
}
else
{
- _alternation!.AddChild(_concatenation!.ReverseLeft());
+ _alternation!.AddChild(_concatenation!.ReverseConcatenationIfRightToLeft());
}
_concatenation = new RegexNode(RegexNode.Concatenate, _options);
private RegexNode? Unit() => _unit;
/// <summary>Sets the current unit to a single char node</summary>
- private void AddUnitOne(char ch)
- {
- if (UseOptionI())
- {
- ch = _culture.TextInfo.ToLower(ch);
- }
-
- _unit = new RegexNode(RegexNode.One, _options, ch);
- }
-
- /// <summary>Sets the current unit to a single inverse-char node</summary>
- private void AddUnitNotone(char ch)
- {
- if (UseOptionI())
- {
- ch = _culture.TextInfo.ToLower(ch);
- }
-
- _unit = new RegexNode(RegexNode.Notone, _options, ch);
- }
-
- /// <summary>Sets the current unit to a single set node</summary>
- private void AddUnitSet(string cc) => _unit = new RegexNode(RegexNode.Set, _options, cc);
+ private void AddUnitOne(char ch) => _unit = RegexNode.CreateOneWithCaseConversion(ch, _options, _culture);
/// <summary>Sets the current unit to a subtree</summary>
private void AddUnitNode(RegexNode node) => _unit = node;
{
if (_group!.Type == RegexNode.Testgroup || _group.Type == RegexNode.Testref)
{
- _group.AddChild(_concatenation!.ReverseLeft());
+ _group.AddChild(_concatenation!.ReverseConcatenationIfRightToLeft());
if (_group.Type == RegexNode.Testref && _group.ChildCount() > 2 || _group.ChildCount() > 3)
{
}
else
{
- _alternation!.AddChild(_concatenation!.ReverseLeft());
+ _alternation!.AddChild(_concatenation!.ReverseConcatenationIfRightToLeft());
_group.AddChild(_alternation);
}
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.Globalization;
+using System.Runtime.CompilerServices;
+using System.Threading;
namespace System.Text.RegularExpressions
{
_skipAllChildren = false;
}
- /// <summary>Computes the leading substring in <paramref name="tree"/>.</summary>
- /// <remarks>It's quite trivial and gives up easily, in which case an empty string is returned.</remarks>
- public static (string Prefix, bool CaseInsensitive) ComputeLeadingSubstring(RegexTree tree)
+ /// <summary>Computes the leading substring in <paramref name="tree"/>; may be empty.</summary>
+ public static string FindCaseSensitivePrefix(RegexTree tree)
{
- RegexNode curNode = tree.Root;
- RegexNode? concatNode = null;
- int nextChild = 0;
+ var vsb = new ValueStringBuilder(stackalloc char[64]);
+ Process(tree.Root, ref vsb);
+ return vsb.ToString();
- while (true)
+ // Processes the node, adding any prefix text to the builder.
+ // Returns whether processing should continue with subsequent nodes.
+ static bool Process(RegexNode node, ref ValueStringBuilder vsb)
{
- switch (curNode.Type)
+ if (!StackHelper.TryEnsureSufficientExecutionStack())
{
+ // If we're too deep on the stack, just give up finding any more prefix.
+ return false;
+ }
+
+ // We don't bother to handle reversed input, so process at most one node
+ // when handling RightToLeft.
+ bool rtl = (node.Options & RegexOptions.RightToLeft) != 0;
+
+ switch (node.Type)
+ {
+ // Concatenation
case RegexNode.Concatenate:
- if (curNode.ChildCount() > 0)
{
- concatNode = curNode;
- nextChild = 0;
+ int childCount = node.ChildCount();
+ for (int i = 0; i < childCount; i++)
+ {
+ if (!Process(node.Child(i), ref vsb))
+ {
+ return false;
+ }
+ }
+ return !rtl;
}
- break;
- case RegexNode.Atomic:
- case RegexNode.Capture:
- curNode = curNode.Child(0);
- concatNode = null;
- continue;
+ // Alternation: find a string that's a shared prefix of all branches
+ case RegexNode.Alternate:
+ {
+ int childCount = node.ChildCount();
- case RegexNode.Oneloop:
- case RegexNode.Oneloopatomic:
- case RegexNode.Onelazy:
+ // Store the initial branch into the target builder
+ int initialLength = vsb.Length;
+ bool keepExploring = Process(node.Child(0), ref vsb);
+ int addedLength = vsb.Length - initialLength;
- // In release, cutoff at a length to which we can still reasonably construct a string and Boyer-Moore search.
- // In debug, use a smaller cutoff to exercise the cutoff path in tests
- const int Cutoff =
-#if DEBUG
- 50;
-#else
- RegexBoyerMoore.MaxLimit;
-#endif
+ // Then explore the rest of the branches, finding the length of
+ // the prefix they all share in common with the initial branch.
+ if (addedLength != 0)
+ {
+ var alternateSb = new ValueStringBuilder(64);
- if (curNode.M > 0 && curNode.M < Cutoff)
- {
- return (new string(curNode.Ch, curNode.M), (curNode.Options & RegexOptions.IgnoreCase) != 0);
- }
+ // Process each branch. If we reach a point where we've proven there's
+ // no overlap, we can bail early.
+ for (int i = 1; i < childCount && addedLength != 0; i++)
+ {
+ alternateSb.Length = 0;
+
+ // Process the branch. We want to keep exploring after this alternation,
+ // but we can't if either this branch doesn't allow for it or if the prefix
+ // supplied by this branch doesn't entirely match all the previous ones.
+ keepExploring &= Process(node.Child(i), ref alternateSb);
+ keepExploring &= alternateSb.Length == addedLength;
+
+ addedLength = Math.Min(addedLength, alternateSb.Length);
+ for (int j = 0; j < addedLength; j++)
+ {
+ if (vsb[initialLength + j] != alternateSb[j])
+ {
+ addedLength = j;
+ keepExploring = false;
+ break;
+ }
+ }
+ }
- return (string.Empty, false);
+ alternateSb.Dispose();
- case RegexNode.One:
- return (curNode.Ch.ToString(), (curNode.Options & RegexOptions.IgnoreCase) != 0);
+ // Then cull back on what was added based on the other branches.
+ vsb.Length = initialLength + addedLength;
+ }
- case RegexNode.Multi:
- return (curNode.Str!, (curNode.Options & RegexOptions.IgnoreCase) != 0);
+ return !rtl && keepExploring;
+ }
+ // One character
+ case RegexNode.One when (node.Options & RegexOptions.IgnoreCase) == 0:
+ vsb.Append(node.Ch);
+ return !rtl;
+
+ // Multiple characters
+ case RegexNode.Multi when (node.Options & RegexOptions.IgnoreCase) == 0:
+ vsb.Append(node.Str);
+ return !rtl;
+
+ // Loop of one character
+ case RegexNode.Oneloop or RegexNode.Oneloopatomic or RegexNode.Onelazy when node.M > 0 && (node.Options & RegexOptions.IgnoreCase) == 0:
+ const int SingleCharIterationLimit = 32; // arbitrary cut-off to avoid creating super long strings unnecessarily
+ int count = Math.Min(node.M, SingleCharIterationLimit);
+ vsb.Append(node.Ch, count);
+ return count == node.N && !rtl;
+
+ // Loop of a node
+ case RegexNode.Loop or RegexNode.Lazyloop when node.M > 0:
+ {
+ const int NodeIterationLimit = 4; // arbitrary cut-off to avoid creating super long strings unnecessarily
+ int limit = Math.Min(node.M, NodeIterationLimit);
+ for (int i = 0; i < limit; i++)
+ {
+ if (!Process(node.Child(0), ref vsb))
+ {
+ return false;
+ }
+ }
+ return limit == node.N && !rtl;
+ }
+
+ // Grouping nodes for which we only care about their single child
+ case RegexNode.Atomic:
+ case RegexNode.Capture:
+ return Process(node.Child(0), ref vsb);
+
+ // Zero-width anchors and assertions
case RegexNode.Bol:
case RegexNode.Eol:
case RegexNode.Boundary:
case RegexNode.ECMABoundary:
+ case RegexNode.NonBoundary:
+ case RegexNode.NonECMABoundary:
case RegexNode.Beginning:
case RegexNode.Start:
case RegexNode.EndZ:
case RegexNode.End:
case RegexNode.Empty:
+ case RegexNode.UpdateBumpalong:
case RegexNode.Require:
case RegexNode.Prevent:
- break;
+ return true;
+ // Give up for anything else
default:
- return (string.Empty, false);
+ return false;
}
-
- if (concatNode == null || nextChild >= concatNode.ChildCount())
- {
- return (string.Empty, false);
- }
-
- curNode = concatNode.Child(nextChild++);
}
}
- /// <summary>Computes a character class for the first character in <paramref name="tree"/>.</summary>
- /// <remarks>true if a character class could be computed; otherwise, false.</remarks>
- public static (string CharClass, bool CaseInsensitive)[]? ComputeFirstCharClass(RegexTree tree)
+ /// <summary>Finds sets at fixed offsets from the beginning of the pattern.</summary>
+ /// <param name="tree">The RegexNode tree.</param>
+ /// <param name="culture">The culture to use for any case conversions.</param>
+ /// <param name="thorough">true to spend more time finding sets (e.g. through alternations); false to do a faster analysis that's potentially more incomplete.</param>
+ /// <returns>The list of found sets, or null if there aren't any.</returns>
+ public static List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)>? FindFixedDistanceSets(
+ RegexTree tree, CultureInfo culture, bool thorough)
{
- var s = new RegexPrefixAnalyzer(stackalloc int[StackBufferSize]);
- RegexFC? fc = s.RegexFCFromRegexTree(tree);
- s.Dispose();
+ const int MaxLoopExpansion = 20; // arbitrary cut-off to avoid loops adding significant overhead to processing
+ const int MaxFixedResults = 50; // arbitrary cut-off to avoid generating lots of sets unnecessarily
- if (fc == null || fc._nullable)
+ // Find all fixed-distance sets.
+ var results = new List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)>();
+ int distance = 0;
+ TryFindFixedSets(tree.Root, results, ref distance, culture, thorough);
+#if DEBUG
+ foreach ((char[]? Chars, string Set, int Distance, bool CaseInsensitive) result in results)
{
- return null;
+ Debug.Assert(result.Distance <= tree.MinRequiredLength, $"Min: {tree.MinRequiredLength}, Distance: {result.Distance}, Tree: {tree}");
}
+#endif
- if (fc.CaseInsensitive)
+ // Remove any sets that match everything; they're not helpful. (This check exists primarily to weed
+ // out use of . in Singleline mode.)
+ bool hasAny = false;
+ for (int i = 0; i < results.Count; i++)
{
- fc.AddLowercase(((tree.Options & RegexOptions.CultureInvariant) != 0) ? CultureInfo.InvariantCulture : CultureInfo.CurrentCulture);
+ if (results[i].Set == RegexCharClass.AnyClass)
+ {
+ hasAny = true;
+ break;
+ }
}
-
- return new[] { (fc.GetFirstChars(), fc.CaseInsensitive) };
- }
-
- /// <summary>Computes character classes for the first <paramref name="maxChars"/> characters in <paramref name="tree"/>.</summary>
- /// <remarks>
- /// For example, given "hello|world" and a <paramref name="maxChars"/> of 3, this will compute the sets [hw], [eo], and [lr].
- /// As with some of the other computations, it's quite trivial and gives up easily; for example, we could in
- /// theory handle nodes in a concatenation after an alternation, but we look only at the branches of the
- /// alternation itself. As this computation is intended primarily to handle global alternations, it's currently
- /// a reasonable tradeoff between simplicity, performance, and the fullness of potential optimizations.
- /// </remarks>
- public static (string CharClass, bool CaseInsensitive)[]? ComputeMultipleCharClasses(RegexTree tree, int maxChars)
- {
- Debug.Assert(maxChars > 1);
-
- if ((tree.Options & RegexOptions.RightToLeft) != 0)
+ if (hasAny)
{
- // We don't bother for RightToLeft. It's rare and adds non-trivial complication.
- return null;
+ results.RemoveAll(s => s.Set == RegexCharClass.AnyClass);
}
- // The known minimum required length will have already factored in knowledge about alternations.
- // If the known min length is less than the maximum number of chars requested, we can
- // cut this short. If it's zero, there's nothing to be found. If it's one, we won't do
- // any better than ComputeFirstCharClass (and likely worse). Otherwise, don't bother looking for more
- // the min of the min length and the max requested chars.
- maxChars = Math.Min(tree.MinRequiredLength, maxChars);
- if (maxChars <= 1)
+ // If we don't have any results, try harder to compute one for the starting character.
+ // This is a more involved computation that can find things the fixed-distance investigation
+ // doesn't.
+ if (results.Count == 0)
{
- return null;
+ (string CharClass, bool CaseInsensitive)? first = FindFirstCharClass(tree, culture);
+ if (first is not null)
+ {
+ results.Add((null, first.Value.CharClass, 0, first.Value.CaseInsensitive));
+ }
+
+ if (results.Count == 0)
+ {
+ return null;
+ }
}
- // Find an alternation on the path to the first node. If we can't, bail.
- RegexNode node = tree.Root;
- while (node.Type != RegexNode.Alternate)
+ // For every entry, see if we can mark any that are case-insensitive as actually being case-sensitive
+ // based on not participating in case conversion. And then for ones that are case-sensitive, try to
+ // get the chars that make up the set, if there are few enough.
+ Span<char> scratch = stackalloc char[5]; // max optimized by IndexOfAny today
+ for (int i = 0; i < results.Count; i++)
{
- switch (node.Type)
+ (char[]? Chars, string Set, int Distance, bool CaseInsensitive) result = results[i];
+ if (!RegexCharClass.IsNegated(result.Set))
{
- case RegexNode.Atomic:
- case RegexNode.Capture:
- case RegexNode.Concatenate:
- node = node.Child(0);
- break;
+ int count = RegexCharClass.GetSetChars(result.Set, scratch);
+ if (count != 0)
+ {
+ if (result.CaseInsensitive && !RegexCharClass.ParticipatesInCaseConversion(scratch.Slice(0, count)))
+ {
+ result.CaseInsensitive = false;
+ }
- default:
- return null;
+ if (!result.CaseInsensitive)
+ {
+ result.Chars = scratch.Slice(0, count).ToArray();
+ }
+
+ results[i] = result;
+ }
}
}
- Debug.Assert(node.Type == RegexNode.Alternate);
- // Create RegexCharClasses to store the built-up sets. We may end up returning fewer
- // than this if we find we can't easily fill this number of sets with 100% confidence.
- var classes = new RegexCharClass?[maxChars];
- bool caseInsensitive = false;
+ // Finally, try to move the "best" results to be earlier. "best" here are ones we're able to search
+ // for the fastest and that have the best chance of matching as few false positives as possible.
+ results.Sort((s1, s2) =>
+ {
+ if (s1.CaseInsensitive != s2.CaseInsensitive)
+ {
+ // If their case-sensitivities don't match, whichever is case-sensitive comes first / is considered lower.
+ return s1.CaseInsensitive ? 1 : -1;
+ }
+
+ if (s1.Chars is not null && s2.Chars is not null)
+ {
+ // When both sets have their chars enumerated, prefer the one whose chars have the lower summed frequency. The frequency is
+ // only an approximation, used as a tie-breaker when we'd otherwise effectively be picking randomly. True
+ // frequencies will vary widely based on the actual data being searched, the language of the data, etc.
+ int c = SumFrequencies(s1.Chars).CompareTo(SumFrequencies(s2.Chars));
+ if (c != 0)
+ {
+ return c;
+ }
- int branches = node.ChildCount();
- Debug.Assert(branches >= 2);
- for (int branchNum = 0; branchNum < branches; branchNum++)
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ static float SumFrequencies(char[] chars)
+ {
+ float sum = 0;
+ foreach (char c in chars)
+ {
+ // Lookup each character in the table. For values > 255, this will end up truncating
+ // and thus we'll get skew in the data. It's already a gross approximation, though,
+ // and it is primarily meant for disambiguation of ASCII letters.
+ sum += s_frequency[(byte)c];
+ }
+ return sum;
+ }
+ }
+ else if (s1.Chars is not null)
+ {
+ // If s1 has chars and s2 doesn't, then s1 has fewer chars.
+ return -1;
+ }
+ else if (s2.Chars is not null)
+ {
+ // If s2 has chars and s1 doesn't, then s2 has fewer chars.
+ return 1;
+ }
+
+ return s1.Distance.CompareTo(s2.Distance);
+ });
+
+ return results;
+
+ // Starting from the specified root node, populates results with any characters at a fixed distance
+ // from the node's starting position. The function returns true if the entire contents of the node
+ // is at a fixed distance, in which case distance will have been updated to include the full length
+ // of the node. If it returns false, the node isn't entirely fixed, in which case subsequent nodes
+ // shouldn't be examined and distance should no longer be trusted. However, regardless of whether it
+ // returns true or false, it may have populated results, and all populated results are valid.
+ static bool TryFindFixedSets(RegexNode node, List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)> results, ref int distance, CultureInfo culture, bool thorough)
{
- RegexNode alternateBranch = node.Child(branchNum);
- caseInsensitive |= (alternateBranch.Options & RegexOptions.IgnoreCase) != 0;
+ if (!StackHelper.TryEnsureSufficientExecutionStack())
+ {
+ return false;
+ }
- switch (alternateBranch.Type)
+ if ((node.Options & RegexOptions.RightToLeft) != 0)
{
+ return false;
+ }
+
+ bool caseInsensitive = (node.Options & RegexOptions.IgnoreCase) != 0;
+
+ switch (node.Type)
+ {
+ case RegexNode.One:
+ if (results.Count < MaxFixedResults)
+ {
+ string setString = RegexCharClass.OneToStringClass(node.Ch, caseInsensitive ? culture : null, out bool resultIsCaseInsensitive);
+ results.Add((null, setString, distance++, resultIsCaseInsensitive));
+ return true;
+ }
+ return false;
+
+ case RegexNode.Onelazy or RegexNode.Oneloop or RegexNode.Oneloopatomic when node.M > 0:
+ {
+ string setString = RegexCharClass.OneToStringClass(node.Ch, caseInsensitive ? culture : null, out bool resultIsCaseInsensitive);
+ int minIterations = Math.Min(node.M, MaxLoopExpansion);
+ int i = 0;
+ for (; i < minIterations && results.Count < MaxFixedResults; i++)
+ {
+ results.Add((null, setString, distance++, resultIsCaseInsensitive));
+ }
+ return i == node.M && i == node.N;
+ }
+
case RegexNode.Multi:
- maxChars = Math.Min(maxChars, alternateBranch.Str!.Length);
- for (int i = 0; i < maxChars; i++)
{
- (classes[i] ??= new RegexCharClass()).AddChar(alternateBranch.Str[i]);
+ string s = node.Str!;
+ int i = 0;
+ for (; i < s.Length && results.Count < MaxFixedResults; i++)
+ {
+ string setString = RegexCharClass.OneToStringClass(s[i], caseInsensitive ? culture : null, out bool resultIsCaseInsensitive);
+ results.Add((null, setString, distance++, resultIsCaseInsensitive));
+ }
+ return i == s.Length;
}
- continue;
+
+ case RegexNode.Set:
+ if (results.Count < MaxFixedResults)
+ {
+ results.Add((null, node.Str!, distance++, caseInsensitive));
+ return true;
+ }
+ return false;
+
+ case RegexNode.Setlazy or RegexNode.Setloop or RegexNode.Setloopatomic when node.M > 0:
+ {
+ int minIterations = Math.Min(node.M, MaxLoopExpansion);
+ int i = 0;
+ for (; i < minIterations && results.Count < MaxFixedResults; i++)
+ {
+ results.Add((null, node.Str!, distance++, caseInsensitive));
+ }
+ return i == node.M && i == node.N;
+ }
+
+ case RegexNode.Notone:
+ // We could create a set out of Notone, but it will be of little value in helping to improve
+ // the speed of finding the first place to match, as almost every character will match it.
+ distance++;
+ return true;
+
+ case RegexNode.Notonelazy or RegexNode.Notoneloop or RegexNode.Notoneloopatomic when node.M == node.N:
+ distance += node.M;
+ return true;
+
+ case RegexNode.Beginning:
+ case RegexNode.Bol:
+ case RegexNode.Boundary:
+ case RegexNode.ECMABoundary:
+ case RegexNode.Empty:
+ case RegexNode.End:
+ case RegexNode.EndZ:
+ case RegexNode.Eol:
+ case RegexNode.NonBoundary:
+ case RegexNode.NonECMABoundary:
+ case RegexNode.UpdateBumpalong:
+ case RegexNode.Start:
+ case RegexNode.Prevent:
+ case RegexNode.Require:
+ // Zero-width anchors and assertions. In theory for Prevent and Require we could also investigate
+ // them and use the learned knowledge to impact the generated sets, at least for lookaheads.
+ // For now, we don't bother.
+ return true;
+
+ case RegexNode.Atomic:
+ case RegexNode.Group:
+ case RegexNode.Capture:
+ return TryFindFixedSets(node.Child(0), results, ref distance, culture, thorough);
+
+ case RegexNode.Lazyloop or RegexNode.Loop when node.M > 0:
+ // This effectively only iterates the loop once. If deemed valuable,
+ // it could be updated in the future to duplicate the found results
+ // (updated to incorporate distance from previous iterations) and
+ // summed distance for all node.M iterations. If node.M == node.N,
+ // this would then also allow continued evaluation of the rest of the
+ // expression after the loop.
+ TryFindFixedSets(node.Child(0), results, ref distance, culture, thorough);
+ return false;
case RegexNode.Concatenate:
{
- int classPos = 0;
- int concatChildren = alternateBranch.ChildCount();
- for (int i = 0; i < concatChildren && classPos < classes.Length; i++)
+ int childCount = node.ChildCount();
+ for (int i = 0; i < childCount; i++)
{
- RegexNode concatChild = alternateBranch.Child(i);
- caseInsensitive |= (concatChild.Options & RegexOptions.IgnoreCase) != 0;
+ if (!TryFindFixedSets(node.Child(i), results, ref distance, culture, thorough))
+ {
+ return false;
+ }
+ }
+ return true;
+ }
- switch (concatChild.Type)
+ case RegexNode.Alternate when thorough:
+ {
+ int childCount = node.ChildCount();
+ bool allSameSize = true;
+ int? sameDistance = null;
+ var combined = new Dictionary<int, (RegexCharClass Set, bool CaseInsensitive, int Count)>();
+
+ var localResults = new List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)>();
+ for (int i = 0; i < childCount; i++)
+ {
+ localResults.Clear();
+ int localDistance = 0;
+ allSameSize &= TryFindFixedSets(node.Child(i), localResults, ref localDistance, culture, thorough);
+
+ if (localResults.Count == 0)
{
- case RegexNode.One:
- (classes[classPos++] ??= new RegexCharClass()).AddChar(concatChild.Ch);
- break;
- case RegexNode.Set:
- if (!(classes[classPos++] ??= new RegexCharClass()).TryAddCharClass(RegexCharClass.Parse(concatChild.Str!)))
- {
- // If the classes can't be merged, give up.
- return null;
- }
- break;
- case RegexNode.Multi:
- for (int c = 0; c < concatChild.Str!.Length && classPos < classes.Length; c++)
+ return false;
+ }
+
+ if (allSameSize)
+ {
+ if (sameDistance is null)
+ {
+ sameDistance = localDistance;
+ }
+ else if (sameDistance.Value != localDistance)
+ {
+ allSameSize = false;
+ }
+ }
+
+ foreach ((char[]? Chars, string Set, int Distance, bool CaseInsensitive) fixedSet in localResults)
+ {
+ if (combined.TryGetValue(fixedSet.Distance, out (RegexCharClass Set, bool CaseInsensitive, int Count) value))
+ {
+ if (fixedSet.CaseInsensitive == value.CaseInsensitive &&
+ value.Set.TryAddCharClass(RegexCharClass.Parse(fixedSet.Set)))
{
- (classes[classPos++] ??= new RegexCharClass()).AddChar(concatChild.Str[c]);
+ value.Count++;
+ combined[fixedSet.Distance] = value;
}
- break;
+ }
+ else
+ {
+ combined[fixedSet.Distance] = (RegexCharClass.Parse(fixedSet.Set), fixedSet.CaseInsensitive, 1);
+ }
+ }
+ }
+
+ foreach (KeyValuePair<int, (RegexCharClass Set, bool CaseInsensitive, int Count)> pair in combined)
+ {
+ if (results.Count >= MaxFixedResults)
+ {
+ allSameSize = false;
+ break;
+ }
- default: // nothing else supported
- i = concatChildren; // stop looking at additional nodes
- break;
+ if (pair.Value.Count == childCount)
+ {
+ results.Add((null, pair.Value.Set.ToStringClass(), pair.Key + distance, pair.Value.CaseInsensitive));
}
}
- maxChars = Math.Min(maxChars, classPos);
+ if (allSameSize)
+ {
+ Debug.Assert(sameDistance.HasValue);
+ distance += sameDistance.Value;
+ return true;
+ }
+
+ return false;
}
- continue;
default:
- // Any other node type as a branch in the alternation and we give up. Note that we don't special-case One/Notone/Set
- // because that would mean the whole branch was a single char, in which case this computation provides
- // zero benefit over the ComputeFirstCharClass computation.
- return null;
+ return false;
}
}
+ }
- // We've now examined all of the alternate branches and were able to successfully process them.
- // Determine how many we can actually return.
- for (int i = 0; i < maxChars; i++)
- {
- if (classes[i] is null)
- {
- maxChars = i;
- break;
- }
- }
+ // Computes a character class for the first character in tree. This uses a more robust algorithm
+ // than is used by TryFindFixedSets and thus can find starting sets it couldn't. For example,
+ // the fixed-set analysis won't find the starting set for a*b, as the a isn't guaranteed and the b is at a
+ // variable position, but this will find [ab] as it's instead looking for anything that under any
+ // circumstance could possibly start a match.
+ public static (string CharClass, bool CaseInsensitive)? FindFirstCharClass(RegexTree tree, CultureInfo culture)
+ {
+ var s = new RegexPrefixAnalyzer(stackalloc int[StackBufferSize]);
+ RegexFC? fc = s.RegexFCFromRegexTree(tree);
+ s.Dispose();
- // Make sure we got something.
- if (maxChars == 0)
+ if (fc == null || fc._nullable)
{
return null;
}
- // Create and return the RegexPrefix objects.
- var prefixes = new (string CharClass, bool CaseInsensitive)[maxChars];
-
- CultureInfo? ci = null;
- if (caseInsensitive)
- {
- ci = (tree.Options & RegexOptions.CultureInvariant) != 0 ? CultureInfo.InvariantCulture : CultureInfo.CurrentCulture;
- }
-
- for (int i = 0; i < prefixes.Length; i++)
+ if (fc.CaseInsensitive)
{
- if (caseInsensitive)
- {
- classes[i]!.AddLowercase(ci!);
- }
- prefixes[i] = (classes[i]!.ToStringClass(), caseInsensitive);
+ fc.AddLowercase(culture);
}
- return prefixes;
+ return (fc.GetFirstChars(), fc.CaseInsensitive);
}
/// <summary>Takes a RegexTree and computes the leading anchor that it encounters.</summary>
throw new ArgumentException(SR.Format(SR.UnexpectedOpcode, NodeType.ToString(CultureInfo.CurrentCulture)));
}
}
+
+ /// <summary>Percent occurrences in source text (100 * char count / total count).</summary>
+ private static readonly float[] s_frequency = new float[]
+ {
+ 0.000f /* '\x00' */, 0.000f /* '\x01' */, 0.000f /* '\x02' */, 0.000f /* '\x03' */, 0.000f /* '\x04' */, 0.000f /* '\x05' */, 0.000f /* '\x06' */, 0.000f /* '\x07' */,
+ 0.000f /* '\x08' */, 0.001f /* '\x09' */, 0.000f /* '\x0A' */, 0.000f /* '\x0B' */, 0.000f /* '\x0C' */, 0.000f /* '\x0D' */, 0.000f /* '\x0E' */, 0.000f /* '\x0F' */,
+ 0.000f /* '\x10' */, 0.000f /* '\x11' */, 0.000f /* '\x12' */, 0.000f /* '\x13' */, 0.003f /* '\x14' */, 0.000f /* '\x15' */, 0.000f /* '\x16' */, 0.000f /* '\x17' */,
+ 0.000f /* '\x18' */, 0.004f /* '\x19' */, 0.000f /* '\x1A' */, 0.000f /* '\x1B' */, 0.006f /* '\x1C' */, 0.006f /* '\x1D' */, 0.000f /* '\x1E' */, 0.000f /* '\x1F' */,
+ 8.952f /* ' ' */, 0.065f /* ' !' */, 0.420f /* ' "' */, 0.010f /* ' #' */, 0.011f /* ' $' */, 0.005f /* ' %' */, 0.070f /* ' &' */, 0.050f /* ' '' */,
+ 3.911f /* ' (' */, 3.910f /* ' )' */, 0.356f /* ' *' */, 2.775f /* ' +' */, 1.411f /* ' ,' */, 0.173f /* ' -' */, 2.054f /* ' .' */, 0.677f /* ' /' */,
+ 1.199f /* ' 0' */, 0.870f /* ' 1' */, 0.729f /* ' 2' */, 0.491f /* ' 3' */, 0.335f /* ' 4' */, 0.269f /* ' 5' */, 0.435f /* ' 6' */, 0.240f /* ' 7' */,
+ 0.234f /* ' 8' */, 0.196f /* ' 9' */, 0.144f /* ' :' */, 0.983f /* ' ;' */, 0.357f /* ' <' */, 0.661f /* ' =' */, 0.371f /* ' >' */, 0.088f /* ' ?' */,
+ 0.007f /* ' @' */, 0.763f /* ' A' */, 0.229f /* ' B' */, 0.551f /* ' C' */, 0.306f /* ' D' */, 0.449f /* ' E' */, 0.337f /* ' F' */, 0.162f /* ' G' */,
+ 0.131f /* ' H' */, 0.489f /* ' I' */, 0.031f /* ' J' */, 0.035f /* ' K' */, 0.301f /* ' L' */, 0.205f /* ' M' */, 0.253f /* ' N' */, 0.228f /* ' O' */,
+ 0.288f /* ' P' */, 0.034f /* ' Q' */, 0.380f /* ' R' */, 0.730f /* ' S' */, 0.675f /* ' T' */, 0.265f /* ' U' */, 0.309f /* ' V' */, 0.137f /* ' W' */,
+ 0.084f /* ' X' */, 0.023f /* ' Y' */, 0.023f /* ' Z' */, 0.591f /* ' [' */, 0.085f /* ' \' */, 0.590f /* ' ]' */, 0.013f /* ' ^' */, 0.797f /* ' _' */,
+ 0.001f /* ' `' */, 4.596f /* ' a' */, 1.296f /* ' b' */, 2.081f /* ' c' */, 2.005f /* ' d' */, 6.903f /* ' e' */, 1.494f /* ' f' */, 1.019f /* ' g' */,
+ 1.024f /* ' h' */, 3.750f /* ' i' */, 0.286f /* ' j' */, 0.439f /* ' k' */, 2.913f /* ' l' */, 1.459f /* ' m' */, 3.908f /* ' n' */, 3.230f /* ' o' */,
+ 1.444f /* ' p' */, 0.231f /* ' q' */, 4.220f /* ' r' */, 3.924f /* ' s' */, 5.312f /* ' t' */, 2.112f /* ' u' */, 0.737f /* ' v' */, 0.573f /* ' w' */,
+ 0.992f /* ' x' */, 1.067f /* ' y' */, 0.181f /* ' z' */, 0.391f /* ' {' */, 0.056f /* ' |' */, 0.391f /* ' }' */, 0.002f /* ' ~' */, 0.000f /* '\x7F' */,
+ 0.000f /* '\x80' */, 0.000f /* '\x81' */, 0.000f /* '\x82' */, 0.000f /* '\x83' */, 0.000f /* '\x84' */, 0.000f /* '\x85' */, 0.000f /* '\x86' */, 0.000f /* '\x87' */,
+ 0.000f /* '\x88' */, 0.000f /* '\x89' */, 0.000f /* '\x8A' */, 0.000f /* '\x8B' */, 0.000f /* '\x8C' */, 0.000f /* '\x8D' */, 0.000f /* '\x8E' */, 0.000f /* '\x8F' */,
+ 0.000f /* '\x90' */, 0.000f /* '\x91' */, 0.000f /* '\x92' */, 0.000f /* '\x93' */, 0.000f /* '\x94' */, 0.000f /* '\x95' */, 0.000f /* '\x96' */, 0.000f /* '\x97' */,
+ 0.000f /* '\x98' */, 0.000f /* '\x99' */, 0.000f /* '\x9A' */, 0.000f /* '\x9B' */, 0.000f /* '\x9C' */, 0.000f /* '\x9D' */, 0.000f /* '\x9E' */, 0.000f /* '\x9F' */,
+ 0.000f /* '\xA0' */, 0.000f /* '\xA1' */, 0.000f /* '\xA2' */, 0.000f /* '\xA3' */, 0.000f /* '\xA4' */, 0.000f /* '\xA5' */, 0.000f /* '\xA6' */, 0.000f /* '\xA7' */,
+ 0.000f /* '\xA8' */, 0.000f /* '\xA9' */, 0.000f /* '\xAA' */, 0.000f /* '\xAB' */, 0.000f /* '\xAC' */, 0.000f /* '\xAD' */, 0.000f /* '\xAE' */, 0.000f /* '\xAF' */,
+ 0.000f /* '\xB0' */, 0.000f /* '\xB1' */, 0.000f /* '\xB2' */, 0.000f /* '\xB3' */, 0.000f /* '\xB4' */, 0.000f /* '\xB5' */, 0.000f /* '\xB6' */, 0.000f /* '\xB7' */,
+ 0.000f /* '\xB8' */, 0.000f /* '\xB9' */, 0.000f /* '\xBA' */, 0.000f /* '\xBB' */, 0.000f /* '\xBC' */, 0.000f /* '\xBD' */, 0.000f /* '\xBE' */, 0.000f /* '\xBF' */,
+ 0.000f /* '\xC0' */, 0.000f /* '\xC1' */, 0.000f /* '\xC2' */, 0.000f /* '\xC3' */, 0.000f /* '\xC4' */, 0.000f /* '\xC5' */, 0.000f /* '\xC6' */, 0.000f /* '\xC7' */,
+ 0.000f /* '\xC8' */, 0.000f /* '\xC9' */, 0.000f /* '\xCA' */, 0.000f /* '\xCB' */, 0.000f /* '\xCC' */, 0.000f /* '\xCD' */, 0.000f /* '\xCE' */, 0.000f /* '\xCF' */,
+ 0.000f /* '\xD0' */, 0.000f /* '\xD1' */, 0.000f /* '\xD2' */, 0.000f /* '\xD3' */, 0.000f /* '\xD4' */, 0.000f /* '\xD5' */, 0.000f /* '\xD6' */, 0.000f /* '\xD7' */,
+ 0.000f /* '\xD8' */, 0.000f /* '\xD9' */, 0.000f /* '\xDA' */, 0.000f /* '\xDB' */, 0.000f /* '\xDC' */, 0.000f /* '\xDD' */, 0.000f /* '\xDE' */, 0.000f /* '\xDF' */,
+ 0.000f /* '\xE0' */, 0.000f /* '\xE1' */, 0.000f /* '\xE2' */, 0.000f /* '\xE3' */, 0.000f /* '\xE4' */, 0.000f /* '\xE5' */, 0.000f /* '\xE6' */, 0.000f /* '\xE7' */,
+ 0.000f /* '\xE8' */, 0.000f /* '\xE9' */, 0.000f /* '\xEA' */, 0.000f /* '\xEB' */, 0.000f /* '\xEC' */, 0.000f /* '\xED' */, 0.000f /* '\xEE' */, 0.000f /* '\xEF' */,
+ 0.000f /* '\xF0' */, 0.000f /* '\xF1' */, 0.000f /* '\xF2' */, 0.000f /* '\xF3' */, 0.000f /* '\xF4' */, 0.000f /* '\xF5' */, 0.000f /* '\xF6' */, 0.000f /* '\xF7' */,
+ 0.000f /* '\xF8' */, 0.000f /* '\xF9' */, 0.000f /* '\xFA' */, 0.000f /* '\xFB' */, 0.000f /* '\xFC' */, 0.000f /* '\xFD' */, 0.000f /* '\xFE' */, 0.000f /* '\xFF' */,
+ };
+
+ // The above table was generated programmatically with the following. This can be augmented to incorporate additional data sources,
+ // though it is only intended to be a rough approximation to use when tie-breaking and we'd otherwise be picking randomly, so, it's something.
+ // The frequencies may be wildly inaccurate when used with data sources different in nature than the training set, in which case we shouldn't
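+ // be much worse off than just picking randomly. NOTE(review): the generator narrows each char via (byte)c, so chars
+ // above U+00FF are tallied under their low byte (e.g. U+2019 under '\x19'), which presumably explains the small
+ // non-zero entries for '\x14', '\x19', '\x1C', and '\x1D' above. The generator: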
+ //
+ // using System.Runtime.InteropServices;
+ //
+ // var counts = new Dictionary<byte, long>();
+ //
+ // (string, string)[] rootsAndExtensions = new[]
+ // {
+ // (@"d:\repos\runtime\src\", "*.cs"), // C# files in dotnet/runtime
+ // (@"d:\Top25GutenbergBooks", "*.txt"), // Top 25 most popular books on Project Gutenberg
+ // };
+ //
+ // foreach ((string root, string ext) in rootsAndExtensions)
+ // foreach (string path in Directory.EnumerateFiles(root, ext, SearchOption.AllDirectories))
+ // foreach (string line in File.ReadLines(path))
+ // foreach (char c in line.AsSpan().Trim())
+ // CollectionsMarshal.GetValueRefOrAddDefault(counts, (byte)c, out _)++;
+ //
+ // long total = counts.Sum(i => i.Value);
+ //
+ // Console.WriteLine("/// <summary>Percent occurrences in source text (100 * char count / total count).</summary>");
+ // Console.WriteLine("private static readonly float[] s_frequency = new float[]");
+ // Console.WriteLine("{");
+ // int i = 0;
+ // for (int row = 0; row < 32; row++)
+ // {
+ // Console.Write(" ");
+ // for (int col = 0; col < 8; col++)
+ // {
+ // counts.TryGetValue((byte)i, out long charCount);
+ // float frequency = (float)(charCount / (double)total) * 100;
+ // Console.Write($" {frequency:N3}f /* '{(i >= 32 && i < 127 ? $" {(char)i}" : $"\\x{i:X2}")}' */,");
+ // i++;
+ // }
+ // Console.WriteLine();
+ // }
+ // Console.WriteLine("};");
}
internal sealed class RegexFC
using System.Collections;
using System.Collections.Generic;
using System.Globalization;
+using System.Runtime.InteropServices;
namespace System.Text.RegularExpressions
{
/// This is the only function that should be called from outside.
/// It takes a RegexTree and creates a corresponding RegexCode.
/// </summary>
- public static RegexCode Write(RegexTree tree)
+ public static RegexCode Write(RegexTree tree, CultureInfo culture)
{
var writer = new RegexWriter(stackalloc int[EmittedSize], stackalloc int[IntStackSize]);
- RegexCode code = writer.RegexCodeFromRegexTree(tree);
+ RegexCode code = writer.RegexCodeFromRegexTree(tree, culture);
writer.Dispose();
#if DEBUG
/// It also computes various information about the tree, such as
/// prefix data to help with optimizations.
/// </summary>
- public RegexCode RegexCodeFromRegexTree(RegexTree tree)
+ public RegexCode RegexCodeFromRegexTree(RegexTree tree, CultureInfo culture)
{
// Construct sparse capnum mapping if some numbers are unused.
int capsize;
Emit(RegexCode.Stop);
int[] emitted = _emitted.AsSpan().ToArray();
- bool rtl = (tree.Options & RegexOptions.RightToLeft) != 0;
- bool compiled = (tree.Options & RegexOptions.Compiled) != 0;
-
- // Compute prefixes to help optimize FindFirstChar.
- RegexBoyerMoore? boyerMoorePrefix = null;
- (string CharClass, bool CaseInsensitive)[]? leadingCharClasses = null;
- (string leadingSubstring, bool leadingSubstringCI) = RegexPrefixAnalyzer.ComputeLeadingSubstring(tree);
- if (leadingSubstring.Length > 1 && // if it's <= 1, perf is better using leadingCharClasses
- leadingSubstring.Length <= RegexBoyerMoore.MaxLimit)
- {
- // Compute a Boyer-Moore prefix if we find a single string of sufficient length that always begins the expression.
- CultureInfo culture = (tree.Options & RegexOptions.CultureInvariant) != 0 ? CultureInfo.InvariantCulture : CultureInfo.CurrentCulture;
- boyerMoorePrefix = new RegexBoyerMoore(leadingSubstring, leadingSubstringCI, rtl, culture);
- }
-
- // If we didn't find a single leading substring, or if we found one but we won't be able to use it for a Boyer-Moore
- // search, try to compute the characters set that might begin the string.
- if (boyerMoorePrefix is null ||
- (boyerMoorePrefix.NegativeUnicode != null && compiled)) // compilation won't use Boyer-Moore if it has a negative Unicode table
- {
- boyerMoorePrefix = null;
-
- // First we employ a less aggressive but more valuable computation to see if we can find sets for each of the first N
- // characters in the string. If that's unsuccessful, we employ a more aggressive check to compute a set for just
- // the first character in the string.
-
- if ((tree.Options & RegexOptions.Compiled) != 0) // currently not utilized by the interpreter
- {
- leadingCharClasses = RegexPrefixAnalyzer.ComputeMultipleCharClasses(tree, maxChars: 5); // limit of 5 is based on experimentation and can be tweaked as needed
- }
-
- if (leadingCharClasses is null)
- {
- leadingCharClasses = RegexPrefixAnalyzer.ComputeFirstCharClass(tree);
- }
- }
-
- // Compute any anchors starting the expression.
- int leadingAnchor = RegexPrefixAnalyzer.FindLeadingAnchor(tree);
-
// Convert the string table into an ordered string array.
var strings = new string[_stringTable.Count];
foreach (KeyValuePair<string, int> stringEntry in _stringTable)
}
// Return all that in a RegexCode object.
- return new RegexCode(tree, emitted, strings, _trackCount, _caps, capsize, boyerMoorePrefix, leadingCharClasses, leadingAnchor, rtl);
+ return new RegexCode(tree, culture, emitted, strings, _trackCount, _caps, capsize);
}
/// <summary>
/// <summary>
/// Returns an index in the string table for a string;
- /// uses a hashtable to eliminate duplicates.
+ /// uses a dictionary to eliminate duplicates.
/// </summary>
private int StringCode(string str)
{
+#if REGEXGENERATOR
if (!_stringTable.TryGetValue(str, out int i))
{
i = _stringTable.Count;
_stringTable.Add(str, i);
}
-
+#else
+ ref int i = ref CollectionsMarshal.GetValueRefOrAddDefault(_stringTable, str, out bool exists);
+ if (!exists)
+ {
+ i = _stringTable.Count - 1;
+ }
+#endif
return i;
}
private void EmitFragment(int nodetype, RegexNode node, int curIndex)
{
int bits = 0;
- if (node.UseOptionR())
+ if ((node.Options & RegexOptions.RightToLeft) != 0)
{
bits |= RegexCode.Rtl;
}
/// Serializer uses more compacted representations when fewer bits are needed, which is reflected in the first
/// two numbers of the return value. MTBDD terminals are represented by negated numbers as -id.
/// </summary>
+ [ExcludeFromCodeCoverage]
public long[] Serialize()
{
if (IsEmpty)
using System.Collections.Generic;
using System.Diagnostics;
-using System.Runtime.CompilerServices;
using System.Threading;
namespace System.Text.RegularExpressions.Symbolic
lock (this)
{
state.Id = _stateCache.Count;
- int k = state.GetHashCode();
_stateCache.Add(state);
Debug.Assert(_statearray is not null);
private SymbolicRegexInfo(uint i) => _info = i;
- /// <summary>Optimized lookup array for most common combinations.</summary>
- /// <remarks>Most common cases will be 0 (no anchors and not nullable) and 1 (no anchors and nullable)</remarks>
- private static readonly SymbolicRegexInfo[] s_infos = CreateSymbolicRegexInfos();
-
- private static SymbolicRegexInfo[] CreateSymbolicRegexInfos()
- {
- var infos = new SymbolicRegexInfo[128];
- for (uint i = 0; i < infos.Length; i++)
- {
- infos[i] = new SymbolicRegexInfo(i);
- }
- return infos;
- }
-
- private static SymbolicRegexInfo Mk(uint i)
- {
- SymbolicRegexInfo[] infos = s_infos;
- return i < infos.Length ?
- infos[i] :
- new SymbolicRegexInfo(i);
- }
-
internal static SymbolicRegexInfo Mk(bool isAlwaysNullable = false, bool canBeNullable = false, bool startsWithLineAnchor = false,
bool startsWithBoundaryAnchor = false, bool containsSomeAnchor = false,
bool containsLineAnchor = false, bool containsSomeCharacter = false, bool isLazy = true)
i |= IsLazyMask;
}
- return Mk(i);
+ return new SymbolicRegexInfo(i);
}
public bool IsNullable => (_info & IsAlwaysNullableMask) != 0;
}
i = (i & ~IsLazyMask) | isLazy;
- return Mk(i);
+ return new SymbolicRegexInfo(i);
}
public static SymbolicRegexInfo And(params SymbolicRegexInfo[] infos)
i = (i & ~IsLazyMask) | isLazy;
i = (i & ~(IsAlwaysNullableMask | CanBeNullableMask)) | isNullable;
- return Mk(i);
+ return new SymbolicRegexInfo(i);
}
public static SymbolicRegexInfo Concat(SymbolicRegexInfo left_info, SymbolicRegexInfo right_info)
uint i = body_info._info;
// The loop is nullable if either the body is nullable or if the lower bound is 0
- i |= lowerBound == 0 ? (IsAlwaysNullableMask | CanBeNullableMask) : 0;
+ if (lowerBound == 0)
+ {
+ i |= IsAlwaysNullableMask | CanBeNullableMask;
+ }
// The loop is lazy iff it is marked lazy
if (isLazy)
i &= ~IsLazyMask;
}
- return Mk(i);
+ return new SymbolicRegexInfo(i);
}
public static SymbolicRegexInfo Not(SymbolicRegexInfo info) =>
namespace System.Text.RegularExpressions.Symbolic
{
/// <summary>Represents a regex matching engine that performs regex matching using symbolic derivatives.</summary>
- internal abstract class SymbolicRegexMatcher
+ internal interface ISymbolicRegexMatcher
{
- /// <summary>Returns the next match index and length in the input string.</summary>
- /// <param name="isMatch">Whether to return once we know there's a match without determining where exactly it matched.</param>
- /// <param name="input">The input string.</param>
- /// <param name="startat">The start position in the input.</param>
- /// <param name="endat">The end position in the input.</param>
- public abstract SymbolicMatch FindMatch(bool isMatch, string input, int startat, int endat);
-
#if DEBUG
/// <summary>Unwind the regex of the matcher and save the resulting state graph in DGML</summary>
/// <param name="bound">roughly the maximum number of states, 0 means no bound</param>
/// <param name="writer">dgml output is written here</param>
/// <param name="maxLabelLength">maximum length of labels in nodes anything over that length is indicated with .. </param>
/// <param name="asNFA">if true creates NFA instead of DFA</param>
- public abstract void SaveDGML(TextWriter writer, int bound, bool hideStateInfo, bool addDotStar, bool inReverse, bool onlyDFAinfo, int maxLabelLength, bool asNFA);
-
+ void SaveDGML(TextWriter writer, int bound, bool hideStateInfo, bool addDotStar, bool inReverse, bool onlyDFAinfo, int maxLabelLength, bool asNFA);
/// <summary>
/// Generates up to k random strings matched by the regex
/// <param name="randomseed">random seed for the generator, 0 means no random seed</param>
/// <param name="negative">if true then generate inputs that do not match</param>
/// <returns></returns>
- public abstract IEnumerable<string> GenerateRandomMembers(int k, int randomseed, bool negative);
+ IEnumerable<string> GenerateRandomMembers(int k, int randomseed, bool negative);
#endif
}
/// <summary>Represents a regex matching engine that performs regex matching using symbolic derivatives.</summary>
/// <typeparam name="TSetType">Character set type.</typeparam>
- internal sealed class SymbolicRegexMatcher<TSetType> : SymbolicRegexMatcher where TSetType : notnull
+ internal sealed class SymbolicRegexMatcher<TSetType> : ISymbolicRegexMatcher where TSetType : notnull
{
/// <summary>Maximum number of states before switching over to Antimirov mode.</summary>
/// <remarks>
/// <summary>Timeout in milliseconds. This is only used if <see cref="_checkTimeout"/> is true.</summary>
private readonly int _timeout;
- /// <summary>Classifier used to say whether a particular character can start a match for <see cref="_pattern"/>.</summary>
- internal readonly BooleanClassifier _startSetClassifier;
-
- /// <summary>Predicate over characters that make some progress</summary>
- private readonly TSetType _startSet;
-
- /// <summary>Maximum allowed size of <see cref="_startSetArray"/>.</summary>
- private const int StartSetArrayMaxSize = 5;
-
- /// <summary>String of at most <see cref="StartSetArrayMaxSize"/> many characters</summary>
- private readonly char[] _startSetArray;
-
- /// <summary>Number of elements in <see cref="_startSetClassifier"/></summary>
- private readonly int _startSetSize;
-
- /// <summary>If nonempty then <see cref="_pattern"/> has that fixed prefix</summary>
- private readonly string _prefix;
+ /// <summary>Data and routines for skipping ahead to the next place a match could potentially start.</summary>
+ private readonly RegexFindOptimizations? _findOpts;
- /// <summary>Non-null when <see cref="_prefix"/> is nonempty</summary>
- private readonly RegexBoyerMoore? _prefixBoyerMoore;
+ /// <summary>The initial states for the original pattern, keyed off of the previous character kind.</summary>
+ /// <remarks>If the pattern doesn't contain any anchors, there will only be a single initial state.</remarks>
+ private readonly DfaMatchingState<TSetType>[] _initialStates;
- /// <summary>If true then the fixed prefix of <see cref="_pattern"/> is idependent of case</summary>
- private readonly bool _isPrefixCaseInsensitive;
+ /// <summary>The initial states for the dot-star pattern, keyed off of the previous character kind.</summary>
+ /// <remarks>If the pattern doesn't contain any anchors, there will only be a single initial state.</remarks>
+ private readonly DfaMatchingState<TSetType>[] _dotstarredInitialStates;
- /// <summary>Cached skip states from the initial state of <see cref="_dotStarredPattern"/> for the 5 possible previous character kinds.</summary>
- private readonly DfaMatchingState<TSetType>?[] _prefixSkipStates = new DfaMatchingState<TSetType>[CharKind.CharKindCount];
- /// <summary>Cached skip states from the initial state of Ar for the 5 possible previous character kinds.</summary>
- private readonly DfaMatchingState<TSetType>?[] _reversePrefixSkipStates = new DfaMatchingState<TSetType>[CharKind.CharKindCount];
+ /// <summary>The initial states for the reverse pattern, keyed off of the previous character kind.</summary>
+ /// <remarks>If the pattern doesn't contain any anchors, there will only be a single initial state.</remarks>
+ private readonly DfaMatchingState<TSetType>[] _reverseInitialStates;
- private readonly string _reversePrefix;
-
- private readonly DfaMatchingState<TSetType>[] _initialStates = new DfaMatchingState<TSetType>[CharKind.CharKindCount];
- private readonly DfaMatchingState<TSetType>[] _dotstarredInitialStates = new DfaMatchingState<TSetType>[CharKind.CharKindCount];
- private readonly DfaMatchingState<TSetType>[] _reverseInitialStates = new DfaMatchingState<TSetType>[CharKind.CharKindCount];
-
- private readonly uint[] _asciiCharKinds = new uint[128];
-
- internal readonly CultureInfo _culture;
-
- private DfaMatchingState<TSetType> GetSkipState(uint prevCharKind) =>
- Volatile.Read(ref _prefixSkipStates[prevCharKind]) ??
- Interlocked.CompareExchange(ref _prefixSkipStates[prevCharKind], DeltaPlus<BrzozowskiTransition>(_prefix, _dotstarredInitialStates[prevCharKind]), null) ??
- _prefixSkipStates[prevCharKind]!;
-
- private DfaMatchingState<TSetType> GetReverseSkipState(uint prevCharKind) =>
- Volatile.Read(ref _reversePrefixSkipStates[prevCharKind]) ??
- Interlocked.CompareExchange(ref _reversePrefixSkipStates[prevCharKind], DeltaPlus<BrzozowskiTransition>(_reversePrefix, _reverseInitialStates[prevCharKind]), null) ??
- _reversePrefixSkipStates[prevCharKind]!;
+ /// <summary>Lookup table to quickly determine the character kind for ASCII characters.</summary>
+ /// <remarks>Non-null iff the pattern contains anchors; otherwise, it's unused.</remarks>
+ private readonly uint[]? _asciiCharKinds;
/// <summary>Get the minterm of <paramref name="c"/>.</summary>
/// <param name="c">character code</param>
}
/// <summary>Constructs matcher for given symbolic regex.</summary>
- internal SymbolicRegexMatcher(SymbolicRegexNode<TSetType> sr, CharSetSolver css, BDD[] minterms, TimeSpan matchTimeout, CultureInfo culture)
+ internal SymbolicRegexMatcher(SymbolicRegexNode<TSetType> sr, RegexCode code, CharSetSolver css, BDD[] minterms, TimeSpan matchTimeout, CultureInfo culture)
{
+ Debug.Assert(sr._builder._solver is BV64Algebra or BVAlgebra or CharSetSolver, $"Unsupported algebra: {sr._builder._solver}");
+
_pattern = sr;
_builder = sr._builder;
-
_checkTimeout = Regex.InfiniteMatchTimeout != matchTimeout;
_timeout = (int)(matchTimeout.TotalMilliseconds + 0.5); // Round up, so it will be at least 1ms
- _culture = culture;
-
- Debug.Assert(_builder._solver is BV64Algebra or BVAlgebra or CharSetSolver, $"Unsupported algebra: {_builder._solver}");
_partitions = _builder._solver switch
{
BV64Algebra bv64 => bv64._classifier,
_ => new MintermClassifier((CharSetSolver)(object)_builder._solver, minterms),
};
- _dotStarredPattern = _builder.MkConcat(_builder._anyStar, _pattern);
- _reversePattern = _pattern.Reverse();
- ConfigureRegexes();
-
- _startSet = _pattern.GetStartSet();
- if (!_builder._solver.IsSatisfiable(_startSet) || _pattern.CanBeNullable)
+ if (code.FindOptimizations.FindMode != FindNextStartingPositionMode.NoSearch &&
+ code.FindOptimizations.LeadingAnchor == 0) // If there are any anchors, we're better off letting the DFA quickly do its job of determining whether there's a match.
{
- // If the startset is empty make it full instead by including all characters
- // this is to ensure that startset is nonempty -- as an invariant assumed by operations using it
- //
- // Also, if A can be nullable then effectively disable use of startset by making it true
- // because it may force search of next character in startset and fail to recognize an empty match
- // because (by definition) an empty match has no start character.
- //
- // For example (this is also a unit test):
- // for pattern "\B\W*?" or "\B\W*" or "\B\W?" and input "e.g:abc" there is an empty match in position 5
- // but startset \W will force search beyond position 5 and fails to find that match
- _startSet = _builder._solver.True;
+ _findOpts = code.FindOptimizations;
}
- _startSetSize = (int)_builder._solver.ComputeDomainSize(_startSet);
+ // Determine the number of initial states. If there's no anchor, only the default previous
+ // character kind 0 is ever going to be used for all initial states.
+ int statesCount = _pattern._info.ContainsSomeAnchor ? CharKind.CharKindCount : 1;
- BDD startbdd = _builder._solver.ConvertToCharSet(css, _startSet);
- _startSetClassifier = new BooleanClassifier(css, startbdd);
-
- //store the start characters in the A_startset_array if there are not too many characters
- _startSetArray = _startSetSize <= StartSetArrayMaxSize ?
- new List<char>(css.GenerateAllCharacters(startbdd)).ToArray() :
- Array.Empty<char>();
+ // Create the initial states for the original pattern.
+ var initialStates = new DfaMatchingState<TSetType>[statesCount];
+ for (uint i = 0; i < initialStates.Length; i++)
+ {
+ initialStates[i] = _builder.MkState(_pattern, i);
+ }
+ _initialStates = initialStates;
- _prefix = _pattern.GetFixedPrefix(css, culture.Name, out _isPrefixCaseInsensitive);
- _reversePrefix = _reversePattern.GetFixedPrefix(css, culture.Name, out _);
+ // Create the dot-star pattern (a concatenation of any* with the original pattern)
+ // and all of its initial states.
+ _dotStarredPattern = _builder.MkConcat(_builder._anyStar, _pattern);
+ var dotstarredInitialStates = new DfaMatchingState<TSetType>[statesCount];
+ for (uint i = 0; i < dotstarredInitialStates.Length; i++)
+ {
+ // Used to detect if initial state was reentered,
+ // but observe that the behavior from the state may ultimately depend on the previous
+ // input char e.g. possibly causing nullability of \b or \B or of a start-of-line anchor,
+ // in that sense there can be several "versions" (not more than StateCount) of the initial state.
+ DfaMatchingState<TSetType> state = _builder.MkState(_dotStarredPattern, i);
+ state.IsInitialState = true;
+ dotstarredInitialStates[i] = state;
+ }
+ _dotstarredInitialStates = dotstarredInitialStates;
- _prefixBoyerMoore = InitializePrefixBoyerMoore();
+ // Create the reverse pattern (the original pattern in reverse order) and all of its
+ // initial states.
+ _reversePattern = _pattern.Reverse();
+ var reverseInitialStates = new DfaMatchingState<TSetType>[statesCount];
+ for (uint i = 0; i < reverseInitialStates.Length; i++)
+ {
+ reverseInitialStates[i] = _builder.MkState(_reversePattern, i);
+ }
+ _reverseInitialStates = reverseInitialStates;
+ // Initialize our fast-lookup for determining the character kind of ASCII characters.
+ // This is only required when the pattern contains anchors, as otherwise there's only
+ // ever a single kind used.
if (_pattern._info.ContainsSomeAnchor)
{
- for (int i = 0; i < 128; i++)
+ var asciiCharKinds = new uint[128];
+ for (int i = 0; i < asciiCharKinds.Length; i++)
{
TSetType predicate2;
uint charKind;
charKind = CharKind.WordLetter;
}
- _asciiCharKinds[i] = _builder._solver.And(GetMinterm(i), predicate2).Equals(_builder._solver.False) ? 0 : charKind;
+ asciiCharKinds[i] = _builder._solver.And(GetMinterm(i), predicate2).Equals(_builder._solver.False) ? 0 : charKind;
}
+ _asciiCharKinds = asciiCharKinds;
}
}
- private RegexBoyerMoore? InitializePrefixBoyerMoore()
- {
- if (_prefix != string.Empty && _prefix.Length <= RegexBoyerMoore.MaxLimit && _prefix.Length > 1)
- {
- // RegexBoyerMoore expects the prefix to be lower case when case is ignored.
- // Use the culture of the matcher.
- string prefix = _isPrefixCaseInsensitive ? _prefix.ToLower(_culture) : _prefix;
- return new RegexBoyerMoore(prefix, _isPrefixCaseInsensitive, rightToLeft: false, _culture);
- }
-
- return null;
- }
-
- private void ConfigureRegexes()
- {
- void Configure(uint i)
- {
- _initialStates[i] = _builder.MkState(_pattern, i);
-
- // Used to detect if initial state was reentered, then startset can be triggered
- // but observe that the behavior from the state may ultimately depend on the previous
- // input char e.g. possibly causing nullability of \b or \B or of a start-of-line anchor,
- // in that sense there can be several "versions" (not more than StateCount) of the initial state.
- _dotstarredInitialStates[i] = _builder.MkState(_dotStarredPattern, i);
- _dotstarredInitialStates[i].IsInitialState = true;
-
- _reverseInitialStates[i] = _builder.MkState(_reversePattern, i);
- }
-
- // Create initial states for A, A1 and Ar.
- if (!_pattern._info.ContainsSomeAnchor)
- {
- // Only the default previous character kind 0 is ever going to be used for all initial states.
- // _A1q0[0] is recognized as special initial state.
- // This information is used for search optimization based on start set and prefix of A.
- Configure(0);
- }
- else
- {
- for (uint i = 0; i < CharKind.CharKindCount; i++)
- {
- Configure(i);
- }
- }
- }
-
- /// <summary>Return the state after the given <paramref name="pattern"/> string from the given state <paramref name="state"/>.</summary>
- private DfaMatchingState<TSetType> DeltaPlus<TTransition>(string pattern, DfaMatchingState<TSetType> state) where TTransition : struct, ITransition
- {
- for (int i = 0; i < pattern.Length; i++)
- {
- state = Delta<TTransition>(pattern, i, state);
- }
-
- return state;
- }
-
/// <summary>Interface for transitions used by the <see cref="Delta"/> method.</summary>
private interface ITransition
{
minterms.Length : // mintermId = minterms.Length represents \Z (last \n)
_partitions.GetMintermID(c);
- TSetType minterm = (uint)mintermId < minterms.Length ?
+ TSetType minterm = (uint)mintermId < (uint)minterms.Length ?
minterms[mintermId] :
_builder._solver.False; // minterm=False represents \Z
private void DoCheckTimeout(int timeoutOccursAt)
{
- // This code is identical to RegexRunner.DoCheckTimeout(),
- // with the exception of check skipping. RegexRunner calls
- // DoCheckTimeout potentially on every iteration of a loop,
- // whereas this calls it only once per transition.
-
+ // This logic is identical to RegexRunner.DoCheckTimeout, with the exception of check skipping. RegexRunner calls
+ // DoCheckTimeout potentially on every iteration of a loop, whereas this calls it only once per transition.
int currentMillis = Environment.TickCount;
-
- if (currentMillis < timeoutOccursAt)
- return;
-
- if (0 > timeoutOccursAt && 0 < currentMillis)
- return;
-
- //regex pattern is in general not available in srm and
- //the input is not available here but could be passed as argument to DoCheckTimeout
- throw new RegexMatchTimeoutException(string.Empty, string.Empty, TimeSpan.FromMilliseconds(_timeout));
+ if (currentMillis >= timeoutOccursAt && (0 <= timeoutOccursAt || 0 >= currentMillis))
+ {
+ throw new RegexMatchTimeoutException(string.Empty, string.Empty, TimeSpan.FromMilliseconds(_timeout));
+ }
}
/// <summary>Find a match.</summary>
/// <param name="isMatch">Whether to return once we know there's a match without determining where exactly it matched.</param>
- /// <param name="input">input string</param>
- /// <param name="startat">the position to start search in the input string</param>
- /// <param name="k">the next position after the end position in the input</param>
- public override SymbolicMatch FindMatch(bool isMatch, string input, int startat, int k)
+ /// <param name="input">The input string.</param>
+ /// <param name="startat">The position to start search in the input string.</param>
+ /// <param name="end">The non-inclusive position to end the search in the input string.</param>
+ public SymbolicMatch FindMatch(bool isMatch, string input, int startat, int end)
{
int timeoutOccursAt = 0;
if (_checkTimeout)
timeoutOccursAt = Environment.TickCount + (int)(_timeout + 0.5);
}
- if (startat == k)
+ if (startat == end)
{
- //covers the special case when the remaining input suffix
- //where a match is sought is empty (for example when the input is empty)
- //in this case the only possible match is an empty match
+ // Covers the special-case of an empty match at the end of the input.
uint prevKind = GetCharKind(input, startat - 1);
uint nextKind = GetCharKind(input, startat);
bool emptyMatchExists = _pattern.IsNullableFor(CharKind.Context(prevKind, nextKind));
- return
- !emptyMatchExists ? SymbolicMatch.NoMatch :
- new SymbolicMatch(startat, 0);
+ return emptyMatchExists ?
+ new SymbolicMatch(startat, 0) :
+ SymbolicMatch.NoMatch;
}
// Find the first accepting state. Initial start position in the input is i == 0.
// May return -1 as a legitimate value when the initial state is nullable and startat == 0.
// Returns NoMatchExists when there is no match.
- i = FindFinalStatePosition(input, k, i, timeoutOccursAt, out int i_q0_A1, out int watchdog);
+ i = FindFinalStatePosition(input, end, i, timeoutOccursAt, out int i_q0_A1, out int watchdog);
if (i == NoMatchExists)
{
}
else
{
- if (i < startat)
- {
- Debug.Assert(i == startat - 1);
- i_start = startat;
- }
- else
- {
- // Walk in reverse to locate the start position of the match
- i_start = FindStartPosition(input, i, i_q0_A1);
- }
-
- i_end = FindEndPosition(input, k, i_start);
+ Debug.Assert(i >= startat - 1);
+ i_start = i < startat ?
+ startat :
+ FindStartPosition(input, i, i_q0_A1); // Walk in reverse to locate the start position of the match
+ i_end = FindEndPosition(input, end, i_start);
}
return new SymbolicMatch(i_start, i_end + 1 - i_start);
}
- /// <summary>Find match end position using A, end position is known to exist.</summary>
+ /// <summary>Find match end position using the original pattern, end position is known to exist.</summary>
/// <param name="input">input array</param>
/// <param name="i">inclusive start position</param>
/// <param name="exclusiveEnd">exclusive end position</param>
return i_end;
}
- // Inner loop for FindEndPosition parameterized by an ITransition type.
+ /// <summary>Inner loop for FindEndPosition parameterized by an ITransition type.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private bool FindEndPositionDeltas<TTransition>(string input, ref int i, int j, ref DfaMatchingState<TSetType> q, ref int i_end) where TTransition : struct, ITransition
{
}
else if (q.IsDeadend)
{
- // Nonaccepting sink state (deadend) has been reached in A.
+ // Non-accepting sink state (deadend) has been reached in the original pattern.
// So the match ended when the last i_end was updated.
return true;
}
return false;
}
- /// <summary>Walk back in reverse using Ar to find the start position of match, start position is known to exist.</summary>
+ /// <summary>Walk back in reverse using the reverse pattern to find the start position of match, start position is known to exist.</summary>
/// <param name="input">the input string</param>
/// <param name="i">position to start walking back from, i points at the last character of the match</param>
/// <param name="match_start_boundary">do not pass this boundary when walking back</param>
/// <returns></returns>
private int FindStartPosition(string input, int i, int match_start_boundary)
{
- // Fetch the correct start state for Ar.
+ // Fetch the correct start state for the reverse pattern.
// This depends on previous character --- which, because going backwards, is character number i+1.
uint prevKind = GetCharKind(input, i + 1);
DfaMatchingState<TSetType> q = _reverseInitialStates[prevKind];
- // Ar may have a fixed prefix sequence
- if (_reversePrefix.Length > 0)
- {
- //skip past the prefix portion of Ar
- q = GetReverseSkipState(prevKind);
- i -= _reversePrefix.Length;
- }
-
if (i == -1)
{
Debug.Assert(q.IsNullable(GetCharKind(input, i)), "we reached the beginning of the input, thus the state q must be accepting");
int last_start = -1;
if (q.IsNullable(GetCharKind(input, i)))
{
- // The whole prefix of Ar was in reverse a prefix of A,
- // for example when the pattern of A is concrete word such as "abc"
+ // The whole prefix of the reverse pattern was in reverse a prefix of the original pattern,
+ // for example when the original pattern is concrete word such as "abc"
last_start = i + 1;
}
- //walk back to the accepting state of Ar
+ // Walk back to the accepting state of the reverse pattern
while (i >= match_start_boundary)
{
int j = Math.Max(match_start_boundary, i - AntimirovThresholdLeeway);
if (q.IsNullable(GetCharKind(input, i - 1)))
{
// Earliest start point so far. This must happen at some point
- // or else A1 would not have reached a final state after match_start_boundary.
+ // or else the dot-star pattern would not have reached a final state after match_start_boundary.
last_start = i;
}
/// <param name="watchdog">length of match when positive</param>
private int FindFinalStatePosition(string input, int k, int i, int timeoutOccursAt, out int initialStateIndex, out int watchdog)
{
- // Get the correct start state of A1, which in general depends on the previous character kind in the input.
+ // Get the correct start state of the dot-star pattern, which in general depends on the previous character kind in the input.
uint prevCharKindId = GetCharKind(input, i - 1);
DfaMatchingState<TSetType> q = _dotstarredInitialStates[prevCharKindId];
initialStateIndex = i;
{
if (q.IsInitialState)
{
- // i_q0_A1 is the most recent position in the input when A1 is in the initial state
+ // initialStateIndex tracks the most recent position in the input at which the dot-star pattern is in the initial state
initialStateIndex = i;
- if (_prefixBoyerMoore != null)
+ if (_findOpts is RegexFindOptimizations findOpts)
{
- // Stay in the initial state if the prefix does not match.
- // Thus advance the current position to the first position where the prefix does match.
- i = _prefixBoyerMoore.Scan(input, i, 0, input.Length);
-
- if (i == -1) // Scan returns -1 when a matching position does not exist
- {
- watchdog = -1;
- return -2;
- }
-
- // Compute the end state for the A prefix.
- // Skip directly to the resulting state
- // --- i.e. do the loop ---
- // for (int j = 0; j < prefix.Length; j++)
- // q = Delta(prefix[j], q, out regex);
- // ---
- q = GetSkipState(q.PrevCharKind);
-
- // skip the prefix
- i += _prefix.Length;
-
- // here i points at the next character (the character immediately following the prefix)
- if (q.IsNullable(GetCharKind(input, i)))
- {
- // Return the last position of the match
- watchdog = q.WatchDog;
- return i - 1;
- }
-
- if (i == k)
- {
- // no match was found
- return -2;
- }
- }
- else
- {
- // we are still in the initial state, when the prefix is empty
- // find the first position i that matches with some character in the start set
- i = IndexOfStartSet(input, i);
-
- if (i == -1)
+ // Find the first position i that matches with some likely character.
+ if (!findOpts.TryFindNextStartingPosition(input, ref i, 0, 0, k))
{
// no match was found
return NoMatchExists;
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private uint GetCharKind(string input, int i)
{
- if (!_pattern._info.ContainsSomeAnchor)
- {
- // The previous character kind is irrelevant when anchors are not used.
- return CharKind.General;
- }
-
- if (i == -1 || i == input.Length)
- {
- return CharKind.StartStop;
- }
+ return !_pattern._info.ContainsSomeAnchor ?
+ CharKind.General : // The previous character kind is irrelevant when anchors are not used.
+ GetCharKindWithAnchor(input, i);
- char nextChar = input[i];
- if (nextChar == '\n')
+ uint GetCharKindWithAnchor(string input, int i)
{
- return
- _builder._newLinePredicate.Equals(_builder._solver.False) ? 0 : // ignore \n
- i == 0 || i == input.Length - 1 ? CharKind.NewLineS : // very first or very last \n. Detection of very first \n is needed for rev(\Z).
- CharKind.Newline;
- }
-
- uint[] asciiCharKinds = _asciiCharKinds;
- return
- nextChar < asciiCharKinds.Length ? asciiCharKinds[nextChar] :
- _builder._solver.And(GetMinterm(nextChar), _builder._wordLetterPredicateForAnchors).Equals(_builder._solver.False) ? 0 : //apply the wordletter predicate to compute the kind of the next character
- CharKind.WordLetter;
- }
+ Debug.Assert(_asciiCharKinds is not null);
- /// <summary>
- /// Find first occurrence of startset element in input starting from index i.
- /// Startset here is assumed to consist of a few characters.
- /// </summary>
- /// <param name="input">input string to search in</param>
- /// <param name="i">the start index in input to search from</param>
- /// <returns></returns>
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- private int IndexOfStartSet(string input, int i)
- {
- if (_startSetSize <= StartSetArrayMaxSize)
- {
- return input.IndexOfAny(_startSetArray, i);
- }
+ if ((uint)i >= (uint)input.Length)
+ {
+ return CharKind.StartStop;
+ }
- for (int j = i; j < input.Length; j++)
- {
- if (_startSetClassifier.IsTrue(input[j]))
+ char nextChar = input[i];
+ if (nextChar == '\n')
{
- return j;
+ return
+ _builder._newLinePredicate.Equals(_builder._solver.False) ? 0 : // ignore \n
+ i == 0 || i == input.Length - 1 ? CharKind.NewLineS : // very first or very last \n. Detection of very first \n is needed for rev(\Z).
+ CharKind.Newline;
}
- }
- return -1;
+ uint[] asciiCharKinds = _asciiCharKinds;
+ return
+ nextChar < (uint)asciiCharKinds.Length ? asciiCharKinds[nextChar] :
+ _builder._solver.And(GetMinterm(nextChar), _builder._wordLetterPredicateForAnchors).Equals(_builder._solver.False) ? 0 : //apply the wordletter predicate to compute the kind of the next character
+ CharKind.WordLetter;
+ }
}
#if DEBUG
- public override void SaveDGML(TextWriter writer, int bound, bool hideStateInfo, bool addDotStar, bool inReverse, bool onlyDFAinfo, int maxLabelLength, bool asNFA)
+ public void SaveDGML(TextWriter writer, int bound, bool hideStateInfo, bool addDotStar, bool inReverse, bool onlyDFAinfo, int maxLabelLength, bool asNFA)
{
var graph = new DGML.RegexAutomaton<TSetType>(this, bound, addDotStar, inReverse, asNFA);
var dgml = new DGML.DgmlWriter(writer, hideStateInfo, maxLabelLength, onlyDFAinfo);
dgml.Write(graph);
}
- public override IEnumerable<string> GenerateRandomMembers(int k, int randomseed, bool negative) =>
+ public IEnumerable<string> GenerateRandomMembers(int k, int randomseed, bool negative) =>
new SymbolicRegexSampler<TSetType>(_pattern, randomseed, negative).GenerateRandomMembers(k);
#endif
}
};
}
- /// <summary>
- /// Gets the string prefix that the regex must match or the empty string if such a prefix does not exist.
- /// Sets ignoreCase = true when the prefix works under case-insensitivity.
- /// For example if the input prefix is "---" it sets ignoreCase=false,
- /// if the prefix is "---[aA][bB]" it returns "---AB" and sets ignoreCase=true
- /// </summary>
- internal string GetFixedPrefix(CharSetSolver css, string culture, out bool ignoreCase)
- {
- ignoreCase = false;
- StringBuilder prefix = new();
- bool doneWithoutIgnoreCase = false;
- bool doneWithIgnoreCase = false;
- foreach (S x in GetPrefixSequence())
- {
- BDD bdd = _builder._solver.ConvertToCharSet(css, x);
- char character = (char)bdd.GetMin();
- // Check if the prefix extends without ignore case: the set is a single character
- if (!doneWithoutIgnoreCase && !css.IsSingleton(bdd))
- {
- doneWithoutIgnoreCase = true;
- }
- if (!doneWithIgnoreCase)
- {
- // Check if the prefix extends with ignore case: ignoring case doesn't change the set
- if (css.ApplyIgnoreCase(css.CharConstraint(character), culture).Equals(bdd))
- {
- // Turn ignoreCase on when the prefix extends only under ignore case
- if (doneWithoutIgnoreCase)
- {
- ignoreCase = true;
- }
- }
- else
- {
- doneWithIgnoreCase = true;
- }
- }
- // Append the character when the prefix extends in either of the ways
- if (!doneWithoutIgnoreCase || !doneWithIgnoreCase)
- prefix.Append(character);
- else
- break;
- }
- return prefix.ToString();
- }
-
- private IEnumerable<S> GetPrefixSequence()
- {
- List<SymbolicRegexNode<S>> paths = new();
- HashSet<SymbolicRegexNode<S>> nextPaths = new();
-
- paths.Add(this);
- while (true)
- {
- bool done = false;
- Debug.Assert(paths.Count > 0, "The generator should have ended when any path fails to extend.");
- // Generate the next set from one path
- S next;
- if (!GetNextPrefixSet(ref paths, ref nextPaths, ref done, out next))
- {
- // A path didn't have a next set as supported by this algorithm
- yield break;
- }
- if (!_builder._solver.IsSatisfiable(next))
- {
- yield break;
- }
- while (paths.Count > 0)
- {
- // For all other paths check that they produce the same set
- S newSet;
- if (!GetNextPrefixSet(ref paths, ref nextPaths, ref done, out newSet) || !newSet.Equals(next))
- {
- // Either a path didn't have a next set as supported by this algorithm, or the next set was not equal
- yield break;
- }
- }
- // At this point all paths generated equal next sets
- yield return next;
- if (done)
- {
- // Some path had no continuation, end the prefix
- yield break;
- }
- else
- {
- Debug.Assert(paths.Count == 0, "Not all paths were considered for next set.");
- paths.AddRange(nextPaths);
- nextPaths.Clear();
- }
- }
- }
-
- private bool GetNextPrefixSet(ref List<SymbolicRegexNode<S>> paths, ref HashSet<SymbolicRegexNode<S>> nextPaths, ref bool done, out S set)
- {
- while (paths.Count > 0)
- {
- SymbolicRegexNode<S> node = paths[paths.Count - 1];
- paths.RemoveAt(paths.Count - 1);
- switch (node._kind)
- {
- case SymbolicRegexKind.Singleton:
- Debug.Assert(node._set is not null);
- set = node._set;
- done = true; // No continuation, done after the next set
- return true;
- case SymbolicRegexKind.Concat:
- Debug.Assert(node._left is not null && node._right is not null);
- if (!node._left.CanBeNullable)
- {
- if (node._left.GetFixedLength() == 1)
- {
- set = node._left.GetStartSet();
- // Left side had just one character, can use just right side as path
- nextPaths.Add(node._right);
- return true;
- }
- else
- {
- // Left side may need multiple steps to get through. However, it is safe
- // (though not complete) to forget the right side and just expand the path
- // for the left side.
- paths.Add(node._left);
- break;
- }
- }
- else
- {
- // Left side may be nullable, can't extend the prefix
- set = _builder._solver.False; // Not going to be used
- return false;
- }
- case SymbolicRegexKind.Or:
- case SymbolicRegexKind.And:
- Debug.Assert(node._alts is not null);
- // Handle alternatives as separate paths
- paths.AddRange(node._alts);
- break;
- default:
- set = _builder._solver.False; // Not going to be used
- return false; // Cut prefix immediately for unhandled node
- }
- }
- set = _builder._solver.False; // Not going to be used
- return false;
- }
/// <summary>Get the predicate that covers all elements that make some progress.</summary>
internal S GetStartSet() => _startSet;
/// <summary>The unicode component, including the BDD algebra.</summary>
internal static readonly UnicodeCategoryTheory<BDD> s_unicode = new UnicodeCategoryTheory<BDD>(new CharSetSolver());
- /// <summary>The matching engine.</summary>
- internal readonly SymbolicRegexMatcher _matcher;
- /// <summary>Minimum length computed</summary>
- private readonly int _minRequiredLength;
+ /// <summary>The matching engine: a SymbolicRegexMatcher of ulong (64 or fewer minterms) or of BV (more than 64 minterms).</summary>
+ internal readonly ISymbolicRegexMatcher _matcher;
/// <summary>Initializes the factory.</summary>
public SymbolicRegexRunnerFactory(RegexCode code, RegexOptions options, TimeSpan matchTimeout, CultureInfo culture)
var solver = (CharSetSolver)s_unicode._solver;
SymbolicRegexNode<BDD> root = converter.Convert(code.Tree.Root, topLevel: true);
- _minRequiredLength = code.Tree.MinRequiredLength;
-
BDD[] minterms = root.ComputeMinterms();
if (minterms.Length > 64)
{
// Use BV to represent a predicate
var algBV = new BVAlgebra(solver, minterms);
- var builderBV = new SymbolicRegexBuilder<BV>(algBV);
-
- // The default constructor sets the following predicates to False; this update happens after the fact.
- // It depends on whether anchors where used in the regex whether the predicates are actually different from False.
- builderBV._wordLetterPredicateForAnchors = algBV.ConvertFromCharSet(solver, converter._builder._wordLetterPredicateForAnchors);
- builderBV._newLinePredicate = algBV.ConvertFromCharSet(solver, converter._builder._newLinePredicate);
+ var builderBV = new SymbolicRegexBuilder<BV>(algBV)
+ {
+ // The default constructor sets the following predicates to False; this update happens after the fact.
+ // Whether the predicates are actually different from False depends on whether anchors were used in the regex.
+ _wordLetterPredicateForAnchors = algBV.ConvertFromCharSet(solver, converter._builder._wordLetterPredicateForAnchors),
+ _newLinePredicate = algBV.ConvertFromCharSet(solver, converter._builder._newLinePredicate)
+ };
- //Convert the BDD based AST to BV based AST
+ // Convert the BDD-based AST to BV-based AST
SymbolicRegexNode<BV> rootBV = converter._builder.Transform(root, builderBV, bdd => builderBV._solver.ConvertFromCharSet(solver, bdd));
- _matcher = new SymbolicRegexMatcher<BV>(rootBV, solver, minterms, matchTimeout, culture);
+ _matcher = new SymbolicRegexMatcher<BV>(rootBV, code, solver, minterms, matchTimeout, culture);
}
else
{
// Convert the BDD-based AST to ulong-based AST
SymbolicRegexNode<ulong> root64 = converter._builder.Transform(root, builder64, bdd => builder64._solver.ConvertFromCharSet(solver, bdd));
- _matcher = new SymbolicRegexMatcher<ulong>(root64, solver, minterms, matchTimeout, culture);
+ _matcher = new SymbolicRegexMatcher<ulong>(root64, code, solver, minterms, matchTimeout, culture);
}
}
/// <summary>Creates a <see cref="RegexRunner"/> object.</summary>
- protected internal override RegexRunner CreateInstance() => new Runner(_matcher, _minRequiredLength);
+ protected internal override RegexRunner CreateInstance() => _matcher is SymbolicRegexMatcher<ulong> srmUInt64 ?
+ new Runner<ulong>(srmUInt64) :
+ new Runner<BV>((SymbolicRegexMatcher<BV>)_matcher);
/// <summary>Runner type produced by this factory.</summary>
/// <remarks>
- /// The wrapped <see cref="SymbolicRegexMatcher"/> is itself thread-safe and can be shared across
+ /// The wrapped <see cref="ISymbolicRegexMatcher"/> is itself thread-safe and can be shared across
/// all runner instances, but the runner itself has state (e.g. for captures, positions, etc.)
/// and must not be shared between concurrent uses.
/// </remarks>
- private sealed class Runner : RegexRunner
+ private sealed class Runner<TSetType> : RegexRunner where TSetType : notnull
{
/// <summary>The matching engine.</summary>
- private readonly SymbolicRegexMatcher _matcher;
- /// <summary>Minimum length computed.</summary>
- private readonly int _minRequiredLength;
+ private readonly SymbolicRegexMatcher<TSetType> _matcher;
- internal Runner(SymbolicRegexMatcher matcher, int minRequiredLength)
- {
- _matcher = matcher;
- _minRequiredLength = minRequiredLength;
- }
+ internal Runner(SymbolicRegexMatcher<TSetType> matcher) => _matcher = matcher;
protected override void InitTrackCount() { } // nop, no backtracking
- protected override bool FindFirstChar() =>
- // The real logic is all in Go. Here we simply validate if there's enough text remaining to possibly match.
- runtextpos <= runtextend - _minRequiredLength;
+ protected override bool FindFirstChar() => true; // The logic is all in Go.
protected override void Go()
{
}
private BDD ToBDD(S pred) => _solver.ConvertToCharSet(SymbolicRegexRunnerFactory.s_unicode._solver, pred);
+
private T Choose<T>(IList<T> elems) => elems[_random.Next(elems.Count)];
- private T Choose<T>(IEnumerable<T> elems)
- {
- List<T> list = new List<T>(elems);
- return list[_random.Next(list.Count)];
- }
+
private char ChooseChar((uint, uint) pair) => (char)_random.Next((int)pair.Item1, (int)pair.Item2 + 1);
+
private char ChooseChar(BDD bdd)
{
Debug.Assert(!bdd.IsEmpty);
BDD bdd1 = SymbolicRegexRunnerFactory.s_unicode._solver.And(bdd, _ascii);
return ChooseChar(Choose(((CharSetSolver)SymbolicRegexRunnerFactory.s_unicode._solver).ToRanges(bdd1.IsEmpty ? bdd : bdd1)));
}
+
private bool ChooseRandomlyTrueOrFalse() => _random.Next(100) < 50;
/// <summary>Returns true if some state is unconditionally final</summary>
+
private bool IsFinal(IEnumerable<SymbolicRegexNode<S>> states)
{
foreach (SymbolicRegexNode<S> state in states)
}
return false;
}
+
/// <summary>Returns true if some state can be final</summary>
private bool CanBeFinal(IEnumerable<SymbolicRegexNode<S>> states)
{
}
return false;
}
+
/// <summary>Returns true if some state is final in the given context</summary>
private bool IsFinal(IEnumerable<SymbolicRegexNode<S>> states, uint context)
{
}
return false;
}
+
private bool IsWordchar(S pred) => _solver.IsSatisfiable(_solver.And(pred, _root._builder._wordLetterPredicateForAnchors));
+
private bool IsNewline(S pred) => _solver.IsSatisfiable(_solver.And(pred, _root._builder._newLinePredicate));
}
}
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
+using System.Diagnostics.CodeAnalysis;
using System.IO;
namespace System.Text.RegularExpressions.Symbolic.Unicode
{
#if DEBUG
+ [ExcludeFromCodeCoverage]
internal static class GeneratorHelper
{
public static void WriteInt64ArrayInitSyntax(StreamWriter sw, long[] values)
using System.Collections.Generic;
using System.Diagnostics;
+using System.Diagnostics.CodeAnalysis;
using System.Globalization;
using System.IO;
namespace System.Text.RegularExpressions.Symbolic.Unicode
{
#if DEBUG
+ [ExcludeFromCodeCoverage]
internal static class IgnoreCaseRelationGenerator
{
private const string DefaultCultureName = "en-US";
using System.Collections.Generic;
using System.Diagnostics;
+using System.Diagnostics.CodeAnalysis;
using System.Globalization;
using System.IO;
{
#if DEBUG
/// <summary>Utility for generating unicode category ranges and corresponing binary decision diagrams.</summary>
+ [ExcludeFromCodeCoverage]
internal static class UnicodeCategoryRangesGenerator
{
/// <summary>Generator for BDD Unicode category definitions.</summary>
}
/// <summary>Used internally for creating a collection of ranges for serialization.</summary>
+ [ExcludeFromCodeCoverage]
internal sealed class Ranges
{
public readonly List<int[]> ranges = new List<int[]>();
Task.Run(() => func(arg1, arg2, arg3))
.ContinueWith(t => t.GetAwaiter().GetResult(), CancellationToken.None, TaskContinuationOptions.ExecuteSynchronously, TaskScheduler.Default)
.GetAwaiter().GetResult();
+
+ /// <summary>Calls the provided function on the stack of a different thread pool thread.</summary>
+ /// <typeparam name="TArg1">The type of the first argument to pass to the function.</typeparam>
+ /// <typeparam name="TArg2">The type of the second argument to pass to the function.</typeparam>
+ /// <typeparam name="TArg3">The type of the third argument to pass to the function.</typeparam>
+ /// <typeparam name="TArg4">The type of the fourth argument to pass to the function.</typeparam>
+ /// <typeparam name="TResult">The return type of the function.</typeparam>
+ /// <param name="func">The function to invoke.</param>
+ /// <param name="arg1">The first argument to pass to the function.</param>
+ /// <param name="arg2">The second argument to pass to the function.</param>
+ /// <param name="arg3">The third argument to pass to the function.</param>
+ /// <param name="arg4">The fourth argument to pass to the function.</param>
+ public static TResult CallOnEmptyStack<TArg1, TArg2, TArg3, TArg4, TResult>(Func<TArg1, TArg2, TArg3, TArg4, TResult> func, TArg1 arg1, TArg2 arg2, TArg3 arg3, TArg4 arg4) =>
+ Task.Run(() => func(arg1, arg2, arg3, arg4))
+ .ContinueWith(t => t.GetAwaiter().GetResult(), CancellationToken.None, TaskContinuationOptions.ExecuteSynchronously, TaskScheduler.Default)
+ .GetAwaiter().GetResult();
}
}
{
public static IEnumerable<object[]> Groups_Basic_TestData()
{
- // (A - B) B is a subset of A(ie B only contains chars that are in A)
- yield return new object[] { null, "[abcd-[d]]+", "dddaabbccddd", RegexOptions.None, new string[] { "aabbcc" } };
+ foreach (RegexEngine engine in RegexHelpers.AvailableEngines)
+ {
+ // (A - B) B is a subset of A (i.e. B only contains chars that are in A)
+ yield return new object[] { engine, null, "[abcd-[d]]+", "dddaabbccddd", RegexOptions.None, new string[] { "aabbcc" } };
- yield return new object[] { null, @"[\d-[357]]+", "33312468955", RegexOptions.None, new string[] { "124689" } };
- yield return new object[] { null, @"[\d-[357]]+", "51246897", RegexOptions.None, new string[] { "124689" } };
- yield return new object[] { null, @"[\d-[357]]+", "3312468977", RegexOptions.None, new string[] { "124689" } };
+ yield return new object[] { engine, null, @"[\d-[357]]+", "33312468955", RegexOptions.None, new string[] { "124689" } };
+ yield return new object[] { engine, null, @"[\d-[357]]+", "51246897", RegexOptions.None, new string[] { "124689" } };
+ yield return new object[] { engine, null, @"[\d-[357]]+", "3312468977", RegexOptions.None, new string[] { "124689" } };
- yield return new object[] { null, @"[\w-[b-y]]+", "bbbaaaABCD09zzzyyy", RegexOptions.None, new string[] { "aaaABCD09zzz" } };
+ yield return new object[] { engine, null, @"[\w-[b-y]]+", "bbbaaaABCD09zzzyyy", RegexOptions.None, new string[] { "aaaABCD09zzz" } };
- yield return new object[] { null, @"[\w-[\d]]+", "0AZaz9", RegexOptions.None, new string[] { "AZaz" } };
- yield return new object[] { null, @"[\w-[\p{Ll}]]+", "a09AZz", RegexOptions.None, new string[] { "09AZ" } };
+ yield return new object[] { engine, null, @"[\w-[\d]]+", "0AZaz9", RegexOptions.None, new string[] { "AZaz" } };
+ yield return new object[] { engine, null, @"[\w-[\p{Ll}]]+", "a09AZz", RegexOptions.None, new string[] { "09AZ" } };
- yield return new object[] { null, @"[\d-[13579]]+", "1024689", RegexOptions.ECMAScript, new string[] { "02468" } };
- yield return new object[] { null, @"[\d-[13579]]+", "\x066102468\x0660", RegexOptions.ECMAScript, new string[] { "02468" } };
- yield return new object[] { null, @"[\d-[13579]]+", "\x066102468\x0660", RegexOptions.None, new string[] { "\x066102468\x0660" } };
+ yield return new object[] { engine, null, @"[\d-[13579]]+", "1024689", RegexOptions.ECMAScript, new string[] { "02468" } };
+ yield return new object[] { engine, null, @"[\d-[13579]]+", "\x066102468\x0660", RegexOptions.ECMAScript, new string[] { "02468" } };
+ yield return new object[] { engine, null, @"[\d-[13579]]+", "\x066102468\x0660", RegexOptions.None, new string[] { "\x066102468\x0660" } };
- yield return new object[] { null, @"[\p{Ll}-[ae-z]]+", "aaabbbcccdddeee", RegexOptions.None, new string[] { "bbbcccddd" } };
- yield return new object[] { null, @"[\p{Nd}-[2468]]+", "20135798", RegexOptions.None, new string[] { "013579" } };
+ yield return new object[] { engine, null, @"[\p{Ll}-[ae-z]]+", "aaabbbcccdddeee", RegexOptions.None, new string[] { "bbbcccddd" } };
+ yield return new object[] { engine, null, @"[\p{Nd}-[2468]]+", "20135798", RegexOptions.None, new string[] { "013579" } };
- yield return new object[] { null, @"[\P{Lu}-[ae-z]]+", "aaabbbcccdddeee", RegexOptions.None, new string[] { "bbbcccddd" } };
- yield return new object[] { null, @"[\P{Nd}-[\p{Ll}]]+", "az09AZ'[]", RegexOptions.None, new string[] { "AZ'[]" } };
+ yield return new object[] { engine, null, @"[\P{Lu}-[ae-z]]+", "aaabbbcccdddeee", RegexOptions.None, new string[] { "bbbcccddd" } };
+ yield return new object[] { engine, null, @"[\P{Nd}-[\p{Ll}]]+", "az09AZ'[]", RegexOptions.None, new string[] { "AZ'[]" } };
- // (A - B) B is a superset of A (ie B contains chars that are in A plus other chars that are not in A)
- yield return new object[] { null, "[abcd-[def]]+", "fedddaabbccddd", RegexOptions.None, new string[] { "aabbcc" } };
+ // (A - B) B is a superset of A (i.e. B contains chars that are in A plus other chars that are not in A)
+ yield return new object[] { engine, null, "[abcd-[def]]+", "fedddaabbccddd", RegexOptions.None, new string[] { "aabbcc" } };
- yield return new object[] { null, @"[\d-[357a-z]]+", "az33312468955", RegexOptions.None, new string[] { "124689" } };
- yield return new object[] { null, @"[\d-[de357fgA-Z]]+", "AZ51246897", RegexOptions.None, new string[] { "124689" } };
- yield return new object[] { null, @"[\d-[357\p{Ll}]]+", "az3312468977", RegexOptions.None, new string[] { "124689" } };
+ yield return new object[] { engine, null, @"[\d-[357a-z]]+", "az33312468955", RegexOptions.None, new string[] { "124689" } };
+ yield return new object[] { engine, null, @"[\d-[de357fgA-Z]]+", "AZ51246897", RegexOptions.None, new string[] { "124689" } };
+ yield return new object[] { engine, null, @"[\d-[357\p{Ll}]]+", "az3312468977", RegexOptions.None, new string[] { "124689" } };
- yield return new object[] { null, @"[\w-[b-y\s]]+", " \tbbbaaaABCD09zzzyyy", RegexOptions.None, new string[] { "aaaABCD09zzz" } };
+ yield return new object[] { engine, null, @"[\w-[b-y\s]]+", " \tbbbaaaABCD09zzzyyy", RegexOptions.None, new string[] { "aaaABCD09zzz" } };
- yield return new object[] { null, @"[\w-[\d\p{Po}]]+", "!#0AZaz9", RegexOptions.None, new string[] { "AZaz" } };
- yield return new object[] { null, @"[\w-[\p{Ll}\s]]+", "a09AZz", RegexOptions.None, new string[] { "09AZ" } };
+ yield return new object[] { engine, null, @"[\w-[\d\p{Po}]]+", "!#0AZaz9", RegexOptions.None, new string[] { "AZaz" } };
+ yield return new object[] { engine, null, @"[\w-[\p{Ll}\s]]+", "a09AZz", RegexOptions.None, new string[] { "09AZ" } };
- yield return new object[] { null, @"[\d-[13579a-zA-Z]]+", "AZ1024689", RegexOptions.ECMAScript, new string[] { "02468" } };
- yield return new object[] { null, @"[\d-[13579abcd]]+", "abcd\x066102468\x0660", RegexOptions.ECMAScript, new string[] { "02468" } };
- yield return new object[] { null, @"[\d-[13579\s]]+", " \t\x066102468\x0660", RegexOptions.None, new string[] { "\x066102468\x0660" } };
+ yield return new object[] { engine, null, @"[\d-[13579a-zA-Z]]+", "AZ1024689", RegexOptions.ECMAScript, new string[] { "02468" } };
+ yield return new object[] { engine, null, @"[\d-[13579abcd]]+", "abcd\x066102468\x0660", RegexOptions.ECMAScript, new string[] { "02468" } };
+ yield return new object[] { engine, null, @"[\d-[13579\s]]+", " \t\x066102468\x0660", RegexOptions.None, new string[] { "\x066102468\x0660" } };
- yield return new object[] { null, @"[\w-[b-y\p{Po}]]+", "!#bbbaaaABCD09zzzyyy", RegexOptions.None, new string[] { "aaaABCD09zzz" } };
+ yield return new object[] { engine, null, @"[\w-[b-y\p{Po}]]+", "!#bbbaaaABCD09zzzyyy", RegexOptions.None, new string[] { "aaaABCD09zzz" } };
- yield return new object[] { null, @"[\w-[b-y!.,]]+", "!.,bbbaaaABCD09zzzyyy", RegexOptions.None, new string[] { "aaaABCD09zzz" } };
- yield return new object[] { null, "[\\w-[b-y\x00-\x0F]]+", "\0bbbaaaABCD09zzzyyy", RegexOptions.None, new string[] { "aaaABCD09zzz" } };
+ yield return new object[] { engine, null, @"[\w-[b-y!.,]]+", "!.,bbbaaaABCD09zzzyyy", RegexOptions.None, new string[] { "aaaABCD09zzz" } };
+ yield return new object[] { engine, null, "[\\w-[b-y\x00-\x0F]]+", "\0bbbaaaABCD09zzzyyy", RegexOptions.None, new string[] { "aaaABCD09zzz" } };
- yield return new object[] { null, @"[\p{Ll}-[ae-z0-9]]+", "09aaabbbcccdddeee", RegexOptions.None, new string[] { "bbbcccddd" } };
- yield return new object[] { null, @"[\p{Nd}-[2468az]]+", "az20135798", RegexOptions.None, new string[] { "013579" } };
+ yield return new object[] { engine, null, @"[\p{Ll}-[ae-z0-9]]+", "09aaabbbcccdddeee", RegexOptions.None, new string[] { "bbbcccddd" } };
+ yield return new object[] { engine, null, @"[\p{Nd}-[2468az]]+", "az20135798", RegexOptions.None, new string[] { "013579" } };
- yield return new object[] { null, @"[\P{Lu}-[ae-zA-Z]]+", "AZaaabbbcccdddeee", RegexOptions.None, new string[] { "bbbcccddd" } };
- yield return new object[] { null, @"[\P{Nd}-[\p{Ll}0123456789]]+", "09az09AZ'[]", RegexOptions.None, new string[] { "AZ'[]" } };
+ yield return new object[] { engine, null, @"[\P{Lu}-[ae-zA-Z]]+", "AZaaabbbcccdddeee", RegexOptions.None, new string[] { "bbbcccddd" } };
+ yield return new object[] { engine, null, @"[\P{Nd}-[\p{Ll}0123456789]]+", "09az09AZ'[]", RegexOptions.None, new string[] { "AZ'[]" } };
- // (A - B) B only contains chars that are not in A
- yield return new object[] { null, "[abc-[defg]]+", "dddaabbccddd", RegexOptions.None, new string[] { "aabbcc" } };
+ // (A - B) B only contains chars that are not in A
+ yield return new object[] { engine, null, "[abc-[defg]]+", "dddaabbccddd", RegexOptions.None, new string[] { "aabbcc" } };
- yield return new object[] { null, @"[\d-[abc]]+", "abc09abc", RegexOptions.None, new string[] { "09" } };
- yield return new object[] { null, @"[\d-[a-zA-Z]]+", "az09AZ", RegexOptions.None, new string[] { "09" } };
- yield return new object[] { null, @"[\d-[\p{Ll}]]+", "az09az", RegexOptions.None, new string[] { "09" } };
+ yield return new object[] { engine, null, @"[\d-[abc]]+", "abc09abc", RegexOptions.None, new string[] { "09" } };
+ yield return new object[] { engine, null, @"[\d-[a-zA-Z]]+", "az09AZ", RegexOptions.None, new string[] { "09" } };
+ yield return new object[] { engine, null, @"[\d-[\p{Ll}]]+", "az09az", RegexOptions.None, new string[] { "09" } };
- yield return new object[] { null, @"[\w-[\x00-\x0F]]+", "bbbaaaABYZ09zzzyyy", RegexOptions.None, new string[] { "bbbaaaABYZ09zzzyyy" } };
+ yield return new object[] { engine, null, @"[\w-[\x00-\x0F]]+", "bbbaaaABYZ09zzzyyy", RegexOptions.None, new string[] { "bbbaaaABYZ09zzzyyy" } };
- yield return new object[] { null, @"[\w-[\s]]+", "0AZaz9", RegexOptions.None, new string[] { "0AZaz9" } };
- yield return new object[] { null, @"[\w-[\W]]+", "0AZaz9", RegexOptions.None, new string[] { "0AZaz9" } };
- yield return new object[] { null, @"[\w-[\p{Po}]]+", "#a09AZz!", RegexOptions.None, new string[] { "a09AZz" } };
+ yield return new object[] { engine, null, @"[\w-[\s]]+", "0AZaz9", RegexOptions.None, new string[] { "0AZaz9" } };
+ yield return new object[] { engine, null, @"[\w-[\W]]+", "0AZaz9", RegexOptions.None, new string[] { "0AZaz9" } };
+ yield return new object[] { engine, null, @"[\w-[\p{Po}]]+", "#a09AZz!", RegexOptions.None, new string[] { "a09AZz" } };
- yield return new object[] { null, @"[\d-[\D]]+", "azAZ1024689", RegexOptions.ECMAScript, new string[] { "1024689" } };
- yield return new object[] { null, @"[\d-[a-zA-Z]]+", "azAZ\x066102468\x0660", RegexOptions.ECMAScript, new string[] { "02468" } };
- yield return new object[] { null, @"[\d-[\p{Ll}]]+", "\x066102468\x0660", RegexOptions.None, new string[] { "\x066102468\x0660" } };
+ yield return new object[] { engine, null, @"[\d-[\D]]+", "azAZ1024689", RegexOptions.ECMAScript, new string[] { "1024689" } };
+ yield return new object[] { engine, null, @"[\d-[a-zA-Z]]+", "azAZ\x066102468\x0660", RegexOptions.ECMAScript, new string[] { "02468" } };
+ yield return new object[] { engine, null, @"[\d-[\p{Ll}]]+", "\x066102468\x0660", RegexOptions.None, new string[] { "\x066102468\x0660" } };
- yield return new object[] { null, @"[a-zA-Z0-9-[\s]]+", " \tazAZ09", RegexOptions.None, new string[] { "azAZ09" } };
+ yield return new object[] { engine, null, @"[a-zA-Z0-9-[\s]]+", " \tazAZ09", RegexOptions.None, new string[] { "azAZ09" } };
- yield return new object[] { null, @"[a-zA-Z0-9-[\W]]+", "bbbaaaABCD09zzzyyy", RegexOptions.None, new string[] { "bbbaaaABCD09zzzyyy" } };
- yield return new object[] { null, @"[a-zA-Z0-9-[^a-zA-Z0-9]]+", "bbbaaaABCD09zzzyyy", RegexOptions.None, new string[] { "bbbaaaABCD09zzzyyy" } };
+ yield return new object[] { engine, null, @"[a-zA-Z0-9-[\W]]+", "bbbaaaABCD09zzzyyy", RegexOptions.None, new string[] { "bbbaaaABCD09zzzyyy" } };
+ yield return new object[] { engine, null, @"[a-zA-Z0-9-[^a-zA-Z0-9]]+", "bbbaaaABCD09zzzyyy", RegexOptions.None, new string[] { "bbbaaaABCD09zzzyyy" } };
- yield return new object[] { null, @"[\p{Ll}-[A-Z]]+", "AZaz09", RegexOptions.None, new string[] { "az" } };
- yield return new object[] { null, @"[\p{Nd}-[a-z]]+", "az09", RegexOptions.None, new string[] { "09" } };
+ yield return new object[] { engine, null, @"[\p{Ll}-[A-Z]]+", "AZaz09", RegexOptions.None, new string[] { "az" } };
+ yield return new object[] { engine, null, @"[\p{Nd}-[a-z]]+", "az09", RegexOptions.None, new string[] { "09" } };
- yield return new object[] { null, @"[\P{Lu}-[\p{Lu}]]+", "AZazAZ", RegexOptions.None, new string[] { "az" } };
- yield return new object[] { null, @"[\P{Lu}-[A-Z]]+", "AZazAZ", RegexOptions.None, new string[] { "az" } };
- yield return new object[] { null, @"[\P{Nd}-[\p{Nd}]]+", "azAZ09", RegexOptions.None, new string[] { "azAZ" } };
- yield return new object[] { null, @"[\P{Nd}-[2-8]]+", "1234567890azAZ1234567890", RegexOptions.None, new string[] { "azAZ" } };
+ yield return new object[] { engine, null, @"[\P{Lu}-[\p{Lu}]]+", "AZazAZ", RegexOptions.None, new string[] { "az" } };
+ yield return new object[] { engine, null, @"[\P{Lu}-[A-Z]]+", "AZazAZ", RegexOptions.None, new string[] { "az" } };
+ yield return new object[] { engine, null, @"[\P{Nd}-[\p{Nd}]]+", "azAZ09", RegexOptions.None, new string[] { "azAZ" } };
+ yield return new object[] { engine, null, @"[\P{Nd}-[2-8]]+", "1234567890azAZ1234567890", RegexOptions.None, new string[] { "azAZ" } };
- // Alternating construct
- yield return new object[] { null, @"([ ]|[\w-[0-9]])+", "09az AZ90", RegexOptions.None, new string[] { "az AZ", "Z" } };
- yield return new object[] { null, @"([0-9-[02468]]|[0-9-[13579]])+", "az1234567890za", RegexOptions.None, new string[] { "1234567890", "0" } };
- yield return new object[] { null, @"([^0-9-[a-zAE-Z]]|[\w-[a-zAF-Z]])+", "azBCDE1234567890BCDEFza", RegexOptions.None, new string[] { "BCDE1234567890BCDE", "E" } };
- yield return new object[] { null, @"([\p{Ll}-[aeiou]]|[^\w-[\s]])+", "aeiobcdxyz!@#aeio", RegexOptions.None, new string[] { "bcdxyz!@#", "#" } };
- yield return new object[] { null, @"(?:hello|hi){1,3}", "hello", RegexOptions.None, new string[] { "hello" } };
- yield return new object[] { null, @"(hello|hi){1,3}", "hellohihey", RegexOptions.None, new string[] { "hellohi", "hi" } };
- yield return new object[] { null, @"(?:hello|hi){1,3}", "hellohihey", RegexOptions.None, new string[] { "hellohi" } };
- yield return new object[] { null, @"(?:hello|hi){2,2}", "hellohihey", RegexOptions.None, new string[] { "hellohi" } };
- yield return new object[] { null, @"(?:hello|hi){2,2}?", "hellohihihello", RegexOptions.None, new string[] { "hellohi" } };
- yield return new object[] { null, @"(?:abc|def|ghi|hij|klm|no){1,4}", "this is a test nonoabcxyz this is only a test", RegexOptions.None, new string[] { "nonoabc" } };
- yield return new object[] { null, @"xyz(abc|def)xyz", "abcxyzdefxyzabc", RegexOptions.None, new string[] { "xyzdefxyz", "def" } };
- yield return new object[] { null, @"abc|(?:def|ghi)", "ghi", RegexOptions.None, new string[] { "ghi" } };
- yield return new object[] { null, @"abc|(def|ghi)", "def", RegexOptions.None, new string[] { "def", "def" } };
+ // Alternating construct
+ yield return new object[] { engine, null, @"([ ]|[\w-[0-9]])+", "09az AZ90", RegexOptions.None, new string[] { "az AZ", "Z" } };
+ yield return new object[] { engine, null, @"([0-9-[02468]]|[0-9-[13579]])+", "az1234567890za", RegexOptions.None, new string[] { "1234567890", "0" } };
+ yield return new object[] { engine, null, @"([^0-9-[a-zAE-Z]]|[\w-[a-zAF-Z]])+", "azBCDE1234567890BCDEFza", RegexOptions.None, new string[] { "BCDE1234567890BCDE", "E" } };
+ yield return new object[] { engine, null, @"([\p{Ll}-[aeiou]]|[^\w-[\s]])+", "aeiobcdxyz!@#aeio", RegexOptions.None, new string[] { "bcdxyz!@#", "#" } };
+ yield return new object[] { engine, null, @"(?:hello|hi){1,3}", "hello", RegexOptions.None, new string[] { "hello" } };
+ yield return new object[] { engine, null, @"(hello|hi){1,3}", "hellohihey", RegexOptions.None, new string[] { "hellohi", "hi" } };
+ yield return new object[] { engine, null, @"(?:hello|hi){1,3}", "hellohihey", RegexOptions.None, new string[] { "hellohi" } };
+ yield return new object[] { engine, null, @"(?:hello|hi){2,2}", "hellohihey", RegexOptions.None, new string[] { "hellohi" } };
+ yield return new object[] { engine, null, @"(?:hello|hi){2,2}?", "hellohihihello", RegexOptions.None, new string[] { "hellohi" } };
+ yield return new object[] { engine, null, @"(?:abc|def|ghi|hij|klm|no){1,4}", "this is a test nonoabcxyz this is only a test", RegexOptions.None, new string[] { "nonoabc" } };
+ yield return new object[] { engine, null, @"xyz(abc|def)xyz", "abcxyzdefxyzabc", RegexOptions.None, new string[] { "xyzdefxyz", "def" } };
+ yield return new object[] { engine, null, @"abc|(?:def|ghi)", "ghi", RegexOptions.None, new string[] { "ghi" } };
+ yield return new object[] { engine, null, @"abc|(def|ghi)", "def", RegexOptions.None, new string[] { "def", "def" } };
- // Multiple character classes using character class subtraction
- yield return new object[] { null, @"98[\d-[9]][\d-[8]][\d-[0]]", "98911 98881 98870 98871", RegexOptions.None, new string[] { "98871" } };
- yield return new object[] { null, @"m[\w-[^aeiou]][\w-[^aeiou]]t", "mbbt mect meet", RegexOptions.None, new string[] { "meet" } };
+ // Multiple character classes using character class subtraction
+ yield return new object[] { engine, null, @"98[\d-[9]][\d-[8]][\d-[0]]", "98911 98881 98870 98871", RegexOptions.None, new string[] { "98871" } };
+ yield return new object[] { engine, null, @"m[\w-[^aeiou]][\w-[^aeiou]]t", "mbbt mect meet", RegexOptions.None, new string[] { "meet" } };
- // Negation with character class subtraction
- yield return new object[] { null, "[abcdef-[^bce]]+", "adfbcefda", RegexOptions.None, new string[] { "bce" } };
- yield return new object[] { null, "[^cde-[ag]]+", "agbfxyzga", RegexOptions.None, new string[] { "bfxyz" } };
+ // Negation with character class subtraction
+ yield return new object[] { engine, null, "[abcdef-[^bce]]+", "adfbcefda", RegexOptions.None, new string[] { "bce" } };
+ yield return new object[] { engine, null, "[^cde-[ag]]+", "agbfxyzga", RegexOptions.None, new string[] { "bfxyz" } };
- // Misc The idea here is come up with real world examples of char class subtraction. Things that
- // would be difficult to define without it
- yield return new object[] { null, @"[\p{L}-[^\p{Lu}]]+", "09',.abcxyzABCXYZ", RegexOptions.None, new string[] { "ABCXYZ" } };
+ // Misc: the idea here is to come up with real-world examples of char class subtraction, i.e. things that
+ // would be difficult to define without it.
+ yield return new object[] { engine, null, @"[\p{L}-[^\p{Lu}]]+", "09',.abcxyzABCXYZ", RegexOptions.None, new string[] { "ABCXYZ" } };
- yield return new object[] { null, @"[\p{IsGreek}-[\P{Lu}]]+", "\u0390\u03FE\u0386\u0388\u03EC\u03EE\u0400", RegexOptions.None, new string[] { "\u03FE\u0386\u0388\u03EC\u03EE" } };
- yield return new object[] { null, @"[\p{IsBasicLatin}-[G-L]]+", "GAFMZL", RegexOptions.None, new string[] { "AFMZ" } };
+ yield return new object[] { engine, null, @"[\p{IsGreek}-[\P{Lu}]]+", "\u0390\u03FE\u0386\u0388\u03EC\u03EE\u0400", RegexOptions.None, new string[] { "\u03FE\u0386\u0388\u03EC\u03EE" } };
+ yield return new object[] { engine, null, @"[\p{IsBasicLatin}-[G-L]]+", "GAFMZL", RegexOptions.None, new string[] { "AFMZ" } };
- yield return new object[] { null, "[a-zA-Z-[aeiouAEIOU]]+", "aeiouAEIOUbcdfghjklmnpqrstvwxyz", RegexOptions.None, new string[] { "bcdfghjklmnpqrstvwxyz" } };
+ yield return new object[] { engine, null, "[a-zA-Z-[aeiouAEIOU]]+", "aeiouAEIOUbcdfghjklmnpqrstvwxyz", RegexOptions.None, new string[] { "bcdfghjklmnpqrstvwxyz" } };
- // The following is an overly complex way of matching an ip address using char class subtraction
- yield return new object[] { null, @"^
+ // The following is an overly complex way of matching an ip address using char class subtraction
+ yield return new object[] { engine, null, @"^
(?<octet>^
(
(
)$"
, "255", RegexOptions.IgnorePatternWhitespace, new string[] { "255", "255", "2", "5", "5", "", "255", "2", "5" } };
- // Character Class Substraction
- yield return new object[] { null, @"[abcd\-d-[bc]]+", "bbbaaa---dddccc", RegexOptions.None, new string[] { "aaa---ddd" } };
- yield return new object[] { null, @"[^a-f-[\x00-\x60\u007B-\uFFFF]]+", "aaafffgggzzz{{{", RegexOptions.None, new string[] { "gggzzz" } };
- yield return new object[] { null, @"[\[\]a-f-[[]]+", "gggaaafff]]][[[", RegexOptions.None, new string[] { "aaafff]]]" } };
- yield return new object[] { null, @"[\[\]a-f-[]]]+", "gggaaafff[[[]]]", RegexOptions.None, new string[] { "aaafff[[[" } };
-
- yield return new object[] { null, @"[ab\-\[cd-[-[]]]]", "a]]", RegexOptions.None, new string[] { "a]]" } };
- yield return new object[] { null, @"[ab\-\[cd-[-[]]]]", "b]]", RegexOptions.None, new string[] { "b]]" } };
- yield return new object[] { null, @"[ab\-\[cd-[-[]]]]", "c]]", RegexOptions.None, new string[] { "c]]" } };
- yield return new object[] { null, @"[ab\-\[cd-[-[]]]]", "d]]", RegexOptions.None, new string[] { "d]]" } };
-
- yield return new object[] { null, @"[ab\-\[cd-[[]]]]", "a]]", RegexOptions.None, new string[] { "a]]" } };
- yield return new object[] { null, @"[ab\-\[cd-[[]]]]", "b]]", RegexOptions.None, new string[] { "b]]" } };
- yield return new object[] { null, @"[ab\-\[cd-[[]]]]", "c]]", RegexOptions.None, new string[] { "c]]" } };
- yield return new object[] { null, @"[ab\-\[cd-[[]]]]", "d]]", RegexOptions.None, new string[] { "d]]" } };
- yield return new object[] { null, @"[ab\-\[cd-[[]]]]", "-]]", RegexOptions.None, new string[] { "-]]" } };
-
- yield return new object[] { null, @"[a-[c-e]]+", "bbbaaaccc", RegexOptions.None, new string[] { "aaa" } };
- yield return new object[] { null, @"[a-[c-e]]+", "```aaaccc", RegexOptions.None, new string[] { "aaa" } };
-
- yield return new object[] { null, @"[a-d\--[bc]]+", "cccaaa--dddbbb", RegexOptions.None, new string[] { "aaa--ddd" } };
-
- // Not Character class substraction
- yield return new object[] { null, @"[\0- [bc]+", "!!!\0\0\t\t [[[[bbbcccaaa", RegexOptions.None, new string[] { "\0\0\t\t [[[[bbbccc" } };
- yield return new object[] { null, "[[abcd]-[bc]]+", "a-b]", RegexOptions.None, new string[] { "a-b]" } };
- yield return new object[] { null, "[-[e-g]+", "ddd[[[---eeefffggghhh", RegexOptions.None, new string[] { "[[[---eeefffggg" } };
- yield return new object[] { null, "[-e-g]+", "ddd---eeefffggghhh", RegexOptions.None, new string[] { "---eeefffggg" } };
- yield return new object[] { null, "[a-e - m-p]+", "---a b c d e m n o p---", RegexOptions.None, new string[] { "a b c d e m n o p" } };
- yield return new object[] { null, "[^-[bc]]", "b] c] -] aaaddd]", RegexOptions.None, new string[] { "d]" } };
- yield return new object[] { null, "[^-[bc]]", "b] c] -] aaa]ddd]", RegexOptions.None, new string[] { "a]" } };
-
- // Make sure we correctly handle \-
- yield return new object[] { null, @"[a\-[bc]+", "```bbbaaa---[[[cccddd", RegexOptions.None, new string[] { "bbbaaa---[[[ccc" } };
- yield return new object[] { null, @"[a\-[\-\-bc]+", "```bbbaaa---[[[cccddd", RegexOptions.None, new string[] { "bbbaaa---[[[ccc" } };
- yield return new object[] { null, @"[a\-\[\-\[\-bc]+", "```bbbaaa---[[[cccddd", RegexOptions.None, new string[] { "bbbaaa---[[[ccc" } };
- yield return new object[] { null, @"[abc\--[b]]+", "[[[```bbbaaa---cccddd", RegexOptions.None, new string[] { "aaa---ccc" } };
- yield return new object[] { null, @"[abc\-z-[b]]+", "```aaaccc---zzzbbb", RegexOptions.None, new string[] { "aaaccc---zzz" } };
- yield return new object[] { null, @"[a-d\-[b]+", "```aaabbbcccddd----[[[[]]]", RegexOptions.None, new string[] { "aaabbbcccddd----[[[[" } };
- yield return new object[] { null, @"[abcd\-d\-[bc]+", "bbbaaa---[[[dddccc", RegexOptions.None, new string[] { "bbbaaa---[[[dddccc" } };
-
- // Everything works correctly with option RegexOptions.IgnorePatternWhitespace
- yield return new object[] { null, "[a - c - [ b ] ]+", "dddaaa ccc [[[[ bbb ]]]", RegexOptions.IgnorePatternWhitespace, new string[] { " ]]]" } };
- yield return new object[] { null, "[a - c - [ b ] +", "dddaaa ccc [[[[ bbb ]]]", RegexOptions.IgnorePatternWhitespace, new string[] { "aaa ccc [[[[ bbb " } };
-
- // Unicode Char Classes
- yield return new object[] { null, @"(\p{Lu}\w*)\s(\p{Lu}\w*)", "Hello World", RegexOptions.None, new string[] { "Hello World", "Hello", "World" } };
- yield return new object[] { null, @"(\p{Lu}\p{Ll}*)\s(\p{Lu}\p{Ll}*)", "Hello World", RegexOptions.None, new string[] { "Hello World", "Hello", "World" } };
- yield return new object[] { null, @"(\P{Ll}\p{Ll}*)\s(\P{Ll}\p{Ll}*)", "Hello World", RegexOptions.None, new string[] { "Hello World", "Hello", "World" } };
- yield return new object[] { null, @"(\P{Lu}+\p{Lu})\s(\P{Lu}+\p{Lu})", "hellO worlD", RegexOptions.None, new string[] { "hellO worlD", "hellO", "worlD" } };
- yield return new object[] { null, @"(\p{Lt}\w*)\s(\p{Lt}*\w*)", "\u01C5ello \u01C5orld", RegexOptions.None, new string[] { "\u01C5ello \u01C5orld", "\u01C5ello", "\u01C5orld" } };
- yield return new object[] { null, @"(\P{Lt}\w*)\s(\P{Lt}*\w*)", "Hello World", RegexOptions.None, new string[] { "Hello World", "Hello", "World" } };
-
- // Character ranges IgnoreCase
- yield return new object[] { null, @"[@-D]+", "eE?@ABCDabcdeE", RegexOptions.IgnoreCase, new string[] { "@ABCDabcd" } };
- yield return new object[] { null, @"[>-D]+", "eE=>?@ABCDabcdeE", RegexOptions.IgnoreCase, new string[] { ">?@ABCDabcd" } };
- yield return new object[] { null, @"[\u0554-\u0557]+", "\u0583\u0553\u0554\u0555\u0556\u0584\u0585\u0586\u0557\u0558", RegexOptions.IgnoreCase, new string[] { "\u0554\u0555\u0556\u0584\u0585\u0586\u0557" } };
- yield return new object[] { null, @"[X-\]]+", "wWXYZxyz[\\]^", RegexOptions.IgnoreCase, new string[] { "XYZxyz[\\]" } };
- yield return new object[] { null, @"[X-\u0533]+", "\u0551\u0554\u0560AXYZaxyz\u0531\u0532\u0533\u0561\u0562\u0563\u0564", RegexOptions.IgnoreCase, new string[] { "AXYZaxyz\u0531\u0532\u0533\u0561\u0562\u0563" } };
- yield return new object[] { null, @"[X-a]+", "wWAXYZaxyz", RegexOptions.IgnoreCase, new string[] { "AXYZaxyz" } };
- yield return new object[] { null, @"[X-c]+", "wWABCXYZabcxyz", RegexOptions.IgnoreCase, new string[] { "ABCXYZabcxyz" } };
- yield return new object[] { null, @"[X-\u00C0]+", "\u00C1\u00E1\u00C0\u00E0wWABCXYZabcxyz", RegexOptions.IgnoreCase, new string[] { "\u00C0\u00E0wWABCXYZabcxyz" } };
- yield return new object[] { null, @"[\u0100\u0102\u0104]+", "\u00FF \u0100\u0102\u0104\u0101\u0103\u0105\u0106", RegexOptions.IgnoreCase, new string[] { "\u0100\u0102\u0104\u0101\u0103\u0105" } };
- yield return new object[] { null, @"[B-D\u0130]+", "aAeE\u0129\u0131\u0068 BCDbcD\u0130\u0069\u0070", RegexOptions.IgnoreCase, new string[] { "BCDbcD\u0130\u0069" } };
- yield return new object[] { null, @"[\u013B\u013D\u013F]+", "\u013A\u013B\u013D\u013F\u013C\u013E\u0140\u0141", RegexOptions.IgnoreCase, new string[] { "\u013B\u013D\u013F\u013C\u013E\u0140" } };
-
- // Escape Chars
- yield return new object[] { null, "(Cat)\r(Dog)", "Cat\rDog", RegexOptions.None, new string[] { "Cat\rDog", "Cat", "Dog" } };
- yield return new object[] { null, "(Cat)\t(Dog)", "Cat\tDog", RegexOptions.None, new string[] { "Cat\tDog", "Cat", "Dog" } };
- yield return new object[] { null, "(Cat)\f(Dog)", "Cat\fDog", RegexOptions.None, new string[] { "Cat\fDog", "Cat", "Dog" } };
-
- // Miscellaneous { witout matching }
- yield return new object[] { null, @"{5", "hello {5 world", RegexOptions.None, new string[] { "{5" } };
- yield return new object[] { null, @"{5,", "hello {5, world", RegexOptions.None, new string[] { "{5," } };
- yield return new object[] { null, @"{5,6", "hello {5,6 world", RegexOptions.None, new string[] { "{5,6" } };
-
- // Miscellaneous inline options
- yield return new object[] { null, @"(?n:(?<cat>cat)(\s+)(?<dog>dog))", "cat dog", RegexOptions.None, new string[] { "cat dog", "cat", "dog" } };
- yield return new object[] { null, @"(?n:(cat)(\s+)(dog))", "cat dog", RegexOptions.None, new string[] { "cat dog" } };
- yield return new object[] { null, @"(?n:(cat)(?<SpaceChars>\s+)(dog))", "cat dog", RegexOptions.None, new string[] { "cat dog", " " } };
- yield return new object[] { null, @"(?x:
+ // Character Class Subtraction
+ yield return new object[] { engine, null, @"[abcd\-d-[bc]]+", "bbbaaa---dddccc", RegexOptions.None, new string[] { "aaa---ddd" } };
+ yield return new object[] { engine, null, @"[^a-f-[\x00-\x60\u007B-\uFFFF]]+", "aaafffgggzzz{{{", RegexOptions.None, new string[] { "gggzzz" } };
+ yield return new object[] { engine, null, @"[\[\]a-f-[[]]+", "gggaaafff]]][[[", RegexOptions.None, new string[] { "aaafff]]]" } };
+ yield return new object[] { engine, null, @"[\[\]a-f-[]]]+", "gggaaafff[[[]]]", RegexOptions.None, new string[] { "aaafff[[[" } };
+
+ yield return new object[] { engine, null, @"[ab\-\[cd-[-[]]]]", "a]]", RegexOptions.None, new string[] { "a]]" } };
+ yield return new object[] { engine, null, @"[ab\-\[cd-[-[]]]]", "b]]", RegexOptions.None, new string[] { "b]]" } };
+ yield return new object[] { engine, null, @"[ab\-\[cd-[-[]]]]", "c]]", RegexOptions.None, new string[] { "c]]" } };
+ yield return new object[] { engine, null, @"[ab\-\[cd-[-[]]]]", "d]]", RegexOptions.None, new string[] { "d]]" } };
+
+ yield return new object[] { engine, null, @"[ab\-\[cd-[[]]]]", "a]]", RegexOptions.None, new string[] { "a]]" } };
+ yield return new object[] { engine, null, @"[ab\-\[cd-[[]]]]", "b]]", RegexOptions.None, new string[] { "b]]" } };
+ yield return new object[] { engine, null, @"[ab\-\[cd-[[]]]]", "c]]", RegexOptions.None, new string[] { "c]]" } };
+ yield return new object[] { engine, null, @"[ab\-\[cd-[[]]]]", "d]]", RegexOptions.None, new string[] { "d]]" } };
+ yield return new object[] { engine, null, @"[ab\-\[cd-[[]]]]", "-]]", RegexOptions.None, new string[] { "-]]" } };
+
+ yield return new object[] { engine, null, @"[a-[c-e]]+", "bbbaaaccc", RegexOptions.None, new string[] { "aaa" } };
+ yield return new object[] { engine, null, @"[a-[c-e]]+", "```aaaccc", RegexOptions.None, new string[] { "aaa" } };
+
+ yield return new object[] { engine, null, @"[a-d\--[bc]]+", "cccaaa--dddbbb", RegexOptions.None, new string[] { "aaa--ddd" } };
+
+ // Not Character class subtraction
+ yield return new object[] { engine, null, @"[\0- [bc]+", "!!!\0\0\t\t [[[[bbbcccaaa", RegexOptions.None, new string[] { "\0\0\t\t [[[[bbbccc" } };
+ yield return new object[] { engine, null, "[[abcd]-[bc]]+", "a-b]", RegexOptions.None, new string[] { "a-b]" } };
+ yield return new object[] { engine, null, "[-[e-g]+", "ddd[[[---eeefffggghhh", RegexOptions.None, new string[] { "[[[---eeefffggg" } };
+ yield return new object[] { engine, null, "[-e-g]+", "ddd---eeefffggghhh", RegexOptions.None, new string[] { "---eeefffggg" } };
+ yield return new object[] { engine, null, "[a-e - m-p]+", "---a b c d e m n o p---", RegexOptions.None, new string[] { "a b c d e m n o p" } };
+ yield return new object[] { engine, null, "[^-[bc]]", "b] c] -] aaaddd]", RegexOptions.None, new string[] { "d]" } };
+ yield return new object[] { engine, null, "[^-[bc]]", "b] c] -] aaa]ddd]", RegexOptions.None, new string[] { "a]" } };
+
+ // Make sure we correctly handle \-
+ yield return new object[] { engine, null, @"[a\-[bc]+", "```bbbaaa---[[[cccddd", RegexOptions.None, new string[] { "bbbaaa---[[[ccc" } };
+ yield return new object[] { engine, null, @"[a\-[\-\-bc]+", "```bbbaaa---[[[cccddd", RegexOptions.None, new string[] { "bbbaaa---[[[ccc" } };
+ yield return new object[] { engine, null, @"[a\-\[\-\[\-bc]+", "```bbbaaa---[[[cccddd", RegexOptions.None, new string[] { "bbbaaa---[[[ccc" } };
+ yield return new object[] { engine, null, @"[abc\--[b]]+", "[[[```bbbaaa---cccddd", RegexOptions.None, new string[] { "aaa---ccc" } };
+ yield return new object[] { engine, null, @"[abc\-z-[b]]+", "```aaaccc---zzzbbb", RegexOptions.None, new string[] { "aaaccc---zzz" } };
+ yield return new object[] { engine, null, @"[a-d\-[b]+", "```aaabbbcccddd----[[[[]]]", RegexOptions.None, new string[] { "aaabbbcccddd----[[[[" } };
+ yield return new object[] { engine, null, @"[abcd\-d\-[bc]+", "bbbaaa---[[[dddccc", RegexOptions.None, new string[] { "bbbaaa---[[[dddccc" } };
+
+ // Everything works correctly with option RegexOptions.IgnorePatternWhitespace
+ yield return new object[] { engine, null, "[a - c - [ b ] ]+", "dddaaa ccc [[[[ bbb ]]]", RegexOptions.IgnorePatternWhitespace, new string[] { " ]]]" } };
+ yield return new object[] { engine, null, "[a - c - [ b ] +", "dddaaa ccc [[[[ bbb ]]]", RegexOptions.IgnorePatternWhitespace, new string[] { "aaa ccc [[[[ bbb " } };
+
+ // Unicode Char Classes
+ yield return new object[] { engine, null, @"(\p{Lu}\w*)\s(\p{Lu}\w*)", "Hello World", RegexOptions.None, new string[] { "Hello World", "Hello", "World" } };
+ yield return new object[] { engine, null, @"(\p{Lu}\p{Ll}*)\s(\p{Lu}\p{Ll}*)", "Hello World", RegexOptions.None, new string[] { "Hello World", "Hello", "World" } };
+ yield return new object[] { engine, null, @"(\P{Ll}\p{Ll}*)\s(\P{Ll}\p{Ll}*)", "Hello World", RegexOptions.None, new string[] { "Hello World", "Hello", "World" } };
+ yield return new object[] { engine, null, @"(\P{Lu}+\p{Lu})\s(\P{Lu}+\p{Lu})", "hellO worlD", RegexOptions.None, new string[] { "hellO worlD", "hellO", "worlD" } };
+ yield return new object[] { engine, null, @"(\p{Lt}\w*)\s(\p{Lt}*\w*)", "\u01C5ello \u01C5orld", RegexOptions.None, new string[] { "\u01C5ello \u01C5orld", "\u01C5ello", "\u01C5orld" } };
+ yield return new object[] { engine, null, @"(\P{Lt}\w*)\s(\P{Lt}*\w*)", "Hello World", RegexOptions.None, new string[] { "Hello World", "Hello", "World" } };
+
+ // Character ranges IgnoreCase
+ yield return new object[] { engine, null, @"[@-D]+", "eE?@ABCDabcdeE", RegexOptions.IgnoreCase, new string[] { "@ABCDabcd" } };
+ yield return new object[] { engine, null, @"[>-D]+", "eE=>?@ABCDabcdeE", RegexOptions.IgnoreCase, new string[] { ">?@ABCDabcd" } };
+ yield return new object[] { engine, null, @"[\u0554-\u0557]+", "\u0583\u0553\u0554\u0555\u0556\u0584\u0585\u0586\u0557\u0558", RegexOptions.IgnoreCase, new string[] { "\u0554\u0555\u0556\u0584\u0585\u0586\u0557" } };
+ yield return new object[] { engine, null, @"[X-\]]+", "wWXYZxyz[\\]^", RegexOptions.IgnoreCase, new string[] { "XYZxyz[\\]" } };
+ yield return new object[] { engine, null, @"[X-\u0533]+", "\u0551\u0554\u0560AXYZaxyz\u0531\u0532\u0533\u0561\u0562\u0563\u0564", RegexOptions.IgnoreCase, new string[] { "AXYZaxyz\u0531\u0532\u0533\u0561\u0562\u0563" } };
+ yield return new object[] { engine, null, @"[X-a]+", "wWAXYZaxyz", RegexOptions.IgnoreCase, new string[] { "AXYZaxyz" } };
+ yield return new object[] { engine, null, @"[X-c]+", "wWABCXYZabcxyz", RegexOptions.IgnoreCase, new string[] { "ABCXYZabcxyz" } };
+ yield return new object[] { engine, null, @"[X-\u00C0]+", "\u00C1\u00E1\u00C0\u00E0wWABCXYZabcxyz", RegexOptions.IgnoreCase, new string[] { "\u00C0\u00E0wWABCXYZabcxyz" } };
+ yield return new object[] { engine, null, @"[\u0100\u0102\u0104]+", "\u00FF \u0100\u0102\u0104\u0101\u0103\u0105\u0106", RegexOptions.IgnoreCase, new string[] { "\u0100\u0102\u0104\u0101\u0103\u0105" } };
+ yield return new object[] { engine, null, @"[B-D\u0130]+", "aAeE\u0129\u0131\u0068 BCDbcD\u0130\u0069\u0070", RegexOptions.IgnoreCase, new string[] { "BCDbcD\u0130\u0069" } };
+ yield return new object[] { engine, null, @"[\u013B\u013D\u013F]+", "\u013A\u013B\u013D\u013F\u013C\u013E\u0140\u0141", RegexOptions.IgnoreCase, new string[] { "\u013B\u013D\u013F\u013C\u013E\u0140" } };
+
+ // Escape Chars
+ yield return new object[] { engine, null, "(Cat)\r(Dog)", "Cat\rDog", RegexOptions.None, new string[] { "Cat\rDog", "Cat", "Dog" } };
+ yield return new object[] { engine, null, "(Cat)\t(Dog)", "Cat\tDog", RegexOptions.None, new string[] { "Cat\tDog", "Cat", "Dog" } };
+ yield return new object[] { engine, null, "(Cat)\f(Dog)", "Cat\fDog", RegexOptions.None, new string[] { "Cat\fDog", "Cat", "Dog" } };
+
+ // Miscellaneous { without matching }
+ yield return new object[] { engine, null, @"{5", "hello {5 world", RegexOptions.None, new string[] { "{5" } };
+ yield return new object[] { engine, null, @"{5,", "hello {5, world", RegexOptions.None, new string[] { "{5," } };
+ yield return new object[] { engine, null, @"{5,6", "hello {5,6 world", RegexOptions.None, new string[] { "{5,6" } };
+
+ // Miscellaneous inline options
+ yield return new object[] { engine, null, @"(?n:(?<cat>cat)(\s+)(?<dog>dog))", "cat dog", RegexOptions.None, new string[] { "cat dog", "cat", "dog" } };
+ yield return new object[] { engine, null, @"(?n:(cat)(\s+)(dog))", "cat dog", RegexOptions.None, new string[] { "cat dog" } };
+ yield return new object[] { engine, null, @"(?n:(cat)(?<SpaceChars>\s+)(dog))", "cat dog", RegexOptions.None, new string[] { "cat dog", " " } };
+ yield return new object[] { engine, null, @"(?x:
(?<cat>cat) # Cat statement
(\s+) # Whitespace chars
(?<dog>dog # Dog statement
))", "cat dog", RegexOptions.None, new string[] { "cat dog", " ", "cat", "dog" } };
- yield return new object[] { null, @"(?+i:cat)", "CAT", RegexOptions.None, new string[] { "CAT" } };
-
- // \d, \D, \s, \S, \w, \W, \P, \p inside character range
- yield return new object[] { null, @"cat([\d]*)dog", "hello123cat230927dog1412d", RegexOptions.None, new string[] { "cat230927dog", "230927" } };
- yield return new object[] { null, @"([\D]*)dog", "65498catdog58719", RegexOptions.None, new string[] { "catdog", "cat" } };
- yield return new object[] { null, @"cat([\s]*)dog", "wiocat dog3270", RegexOptions.None, new string[] { "cat dog", " " } };
- yield return new object[] { null, @"cat([\S]*)", "sfdcatdog 3270", RegexOptions.None, new string[] { "catdog", "dog" } };
- yield return new object[] { null, @"cat([\w]*)", "sfdcatdog 3270", RegexOptions.None, new string[] { "catdog", "dog" } };
- yield return new object[] { null, @"cat([\W]*)dog", "wiocat dog3270", RegexOptions.None, new string[] { "cat dog", " " } };
- yield return new object[] { null, @"([\p{Lu}]\w*)\s([\p{Lu}]\w*)", "Hello World", RegexOptions.None, new string[] { "Hello World", "Hello", "World" } };
- yield return new object[] { null, @"([\P{Ll}][\p{Ll}]*)\s([\P{Ll}][\p{Ll}]*)", "Hello World", RegexOptions.None, new string[] { "Hello World", "Hello", "World" } };
-
- // \x, \u, \a, \b, \e, \f, \n, \r, \t, \v, \c, inside character range
- yield return new object[] { null, @"(cat)([\x41]*)(dog)", "catAAAdog", RegexOptions.None, new string[] { "catAAAdog", "cat", "AAA", "dog" } };
- yield return new object[] { null, @"(cat)([\u0041]*)(dog)", "catAAAdog", RegexOptions.None, new string[] { "catAAAdog", "cat", "AAA", "dog" } };
- yield return new object[] { null, @"(cat)([\a]*)(dog)", "cat\a\a\adog", RegexOptions.None, new string[] { "cat\a\a\adog", "cat", "\a\a\a", "dog" } };
- yield return new object[] { null, @"(cat)([\b]*)(dog)", "cat\b\b\bdog", RegexOptions.None, new string[] { "cat\b\b\bdog", "cat", "\b\b\b", "dog" } };
- yield return new object[] { null, @"(cat)([\e]*)(dog)", "cat\u001B\u001B\u001Bdog", RegexOptions.None, new string[] { "cat\u001B\u001B\u001Bdog", "cat", "\u001B\u001B\u001B", "dog" } };
- yield return new object[] { null, @"(cat)([\f]*)(dog)", "cat\f\f\fdog", RegexOptions.None, new string[] { "cat\f\f\fdog", "cat", "\f\f\f", "dog" } };
- yield return new object[] { null, @"(cat)([\r]*)(dog)", "cat\r\r\rdog", RegexOptions.None, new string[] { "cat\r\r\rdog", "cat", "\r\r\r", "dog" } };
- yield return new object[] { null, @"(cat)([\v]*)(dog)", "cat\v\v\vdog", RegexOptions.None, new string[] { "cat\v\v\vdog", "cat", "\v\v\v", "dog" } };
-
- // \d, \D, \s, \S, \w, \W, \P, \p inside character range ([0-5]) with ECMA Option
- yield return new object[] { null, @"cat([\d]*)dog", "hello123cat230927dog1412d", RegexOptions.ECMAScript, new string[] { "cat230927dog", "230927" } };
- yield return new object[] { null, @"([\D]*)dog", "65498catdog58719", RegexOptions.ECMAScript, new string[] { "catdog", "cat" } };
- yield return new object[] { null, @"cat([\s]*)dog", "wiocat dog3270", RegexOptions.ECMAScript, new string[] { "cat dog", " " } };
- yield return new object[] { null, @"cat([\S]*)", "sfdcatdog 3270", RegexOptions.ECMAScript, new string[] { "catdog", "dog" } };
- yield return new object[] { null, @"cat([\w]*)", "sfdcatdog 3270", RegexOptions.ECMAScript, new string[] { "catdog", "dog" } };
- yield return new object[] { null, @"cat([\W]*)dog", "wiocat dog3270", RegexOptions.ECMAScript, new string[] { "cat dog", " " } };
- yield return new object[] { null, @"([\p{Lu}]\w*)\s([\p{Lu}]\w*)", "Hello World", RegexOptions.ECMAScript, new string[] { "Hello World", "Hello", "World" } };
- yield return new object[] { null, @"([\P{Ll}][\p{Ll}]*)\s([\P{Ll}][\p{Ll}]*)", "Hello World", RegexOptions.ECMAScript, new string[] { "Hello World", "Hello", "World" } };
-
- // \d, \D, \s, \S, \w, \W, \P, \p outside character range ([0-5]) with ECMA Option
- yield return new object[] { null, @"(cat)\d*dog", "hello123cat230927dog1412d", RegexOptions.ECMAScript, new string[] { "cat230927dog", "cat" } };
- yield return new object[] { null, @"\D*(dog)", "65498catdog58719", RegexOptions.ECMAScript, new string[] { "catdog", "dog" } };
- yield return new object[] { null, @"(cat)\s*(dog)", "wiocat dog3270", RegexOptions.ECMAScript, new string[] { "cat dog", "cat", "dog" } };
- yield return new object[] { null, @"(cat)\S*", "sfdcatdog 3270", RegexOptions.ECMAScript, new string[] { "catdog", "cat" } };
- yield return new object[] { null, @"(cat)\w*", "sfdcatdog 3270", RegexOptions.ECMAScript, new string[] { "catdog", "cat" } };
- yield return new object[] { null, @"(cat)\W*(dog)", "wiocat dog3270", RegexOptions.ECMAScript, new string[] { "cat dog", "cat", "dog" } };
- yield return new object[] { null, @"\p{Lu}(\w*)\s\p{Lu}(\w*)", "Hello World", RegexOptions.ECMAScript, new string[] { "Hello World", "ello", "orld" } };
- yield return new object[] { null, @"\P{Ll}\p{Ll}*\s\P{Ll}\p{Ll}*", "Hello World", RegexOptions.ECMAScript, new string[] { "Hello World" } };
-
- // Use < in a group
- yield return new object[] { null, @"cat(?<dog121>dog)", "catcatdogdogcat", RegexOptions.None, new string[] { "catdog", "dog" } };
- yield return new object[] { null, @"(?<cat>cat)\s*(?<cat>dog)", "catcat dogdogcat", RegexOptions.None, new string[] { "cat dog", "dog" } };
- yield return new object[] { null, @"(?<1>cat)\s*(?<1>dog)", "catcat dogdogcat", RegexOptions.None, new string[] { "cat dog", "dog" } };
- yield return new object[] { null, @"(?<2048>cat)\s*(?<2048>dog)", "catcat dogdogcat", RegexOptions.None, new string[] { "cat dog", "dog" } };
- yield return new object[] { null, @"(?<cat>cat)\w+(?<dog-cat>dog)", "cat_Hello_World_dog", RegexOptions.None, new string[] { "cat_Hello_World_dog", "", "_Hello_World_" } };
- yield return new object[] { null, @"(?<cat>cat)\w+(?<-cat>dog)", "cat_Hello_World_dog", RegexOptions.None, new string[] { "cat_Hello_World_dog", "" } };
- yield return new object[] { null, @"(?<cat>cat)\w+(?<cat-cat>dog)", "cat_Hello_World_dog", RegexOptions.None, new string[] { "cat_Hello_World_dog", "_Hello_World_" } };
- yield return new object[] { null, @"(?<1>cat)\w+(?<dog-1>dog)", "cat_Hello_World_dog", RegexOptions.None, new string[] { "cat_Hello_World_dog", "", "_Hello_World_" } };
- yield return new object[] { null, @"(?<cat>cat)\w+(?<2-cat>dog)", "cat_Hello_World_dog", RegexOptions.None, new string[] { "cat_Hello_World_dog", "", "_Hello_World_" } };
- yield return new object[] { null, @"(?<1>cat)\w+(?<2-1>dog)", "cat_Hello_World_dog", RegexOptions.None, new string[] { "cat_Hello_World_dog", "", "_Hello_World_" } };
-
- // Quantifiers
- yield return new object[] { null, @"(?<cat>cat){", "STARTcat{", RegexOptions.None, new string[] { "cat{", "cat" } };
- yield return new object[] { null, @"(?<cat>cat){fdsa", "STARTcat{fdsa", RegexOptions.None, new string[] { "cat{fdsa", "cat" } };
- yield return new object[] { null, @"(?<cat>cat){1", "STARTcat{1", RegexOptions.None, new string[] { "cat{1", "cat" } };
- yield return new object[] { null, @"(?<cat>cat){1END", "STARTcat{1END", RegexOptions.None, new string[] { "cat{1END", "cat" } };
- yield return new object[] { null, @"(?<cat>cat){1,", "STARTcat{1,", RegexOptions.None, new string[] { "cat{1,", "cat" } };
- yield return new object[] { null, @"(?<cat>cat){1,END", "STARTcat{1,END", RegexOptions.None, new string[] { "cat{1,END", "cat" } };
- yield return new object[] { null, @"(?<cat>cat){1,2", "STARTcat{1,2", RegexOptions.None, new string[] { "cat{1,2", "cat" } };
- yield return new object[] { null, @"(?<cat>cat){1,2END", "STARTcat{1,2END", RegexOptions.None, new string[] { "cat{1,2END", "cat" } };
-
- // Use IgnorePatternWhitespace
- yield return new object[] { null, @"(cat) #cat
+ yield return new object[] { engine, null, @"(?+i:cat)", "CAT", RegexOptions.None, new string[] { "CAT" } };
+
+ // \d, \D, \s, \S, \w, \W, \P, \p inside character range
+ yield return new object[] { engine, null, @"cat([\d]*)dog", "hello123cat230927dog1412d", RegexOptions.None, new string[] { "cat230927dog", "230927" } };
+ yield return new object[] { engine, null, @"([\D]*)dog", "65498catdog58719", RegexOptions.None, new string[] { "catdog", "cat" } };
+ yield return new object[] { engine, null, @"cat([\s]*)dog", "wiocat dog3270", RegexOptions.None, new string[] { "cat dog", " " } };
+ yield return new object[] { engine, null, @"cat([\S]*)", "sfdcatdog 3270", RegexOptions.None, new string[] { "catdog", "dog" } };
+ yield return new object[] { engine, null, @"cat([\w]*)", "sfdcatdog 3270", RegexOptions.None, new string[] { "catdog", "dog" } };
+ yield return new object[] { engine, null, @"cat([\W]*)dog", "wiocat dog3270", RegexOptions.None, new string[] { "cat dog", " " } };
+ yield return new object[] { engine, null, @"([\p{Lu}]\w*)\s([\p{Lu}]\w*)", "Hello World", RegexOptions.None, new string[] { "Hello World", "Hello", "World" } };
+ yield return new object[] { engine, null, @"([\P{Ll}][\p{Ll}]*)\s([\P{Ll}][\p{Ll}]*)", "Hello World", RegexOptions.None, new string[] { "Hello World", "Hello", "World" } };
+
+ // \x, \u, \a, \b, \e, \f, \n, \r, \t, \v, \c, inside character range
+ yield return new object[] { engine, null, @"(cat)([\x41]*)(dog)", "catAAAdog", RegexOptions.None, new string[] { "catAAAdog", "cat", "AAA", "dog" } };
+ yield return new object[] { engine, null, @"(cat)([\u0041]*)(dog)", "catAAAdog", RegexOptions.None, new string[] { "catAAAdog", "cat", "AAA", "dog" } };
+ yield return new object[] { engine, null, @"(cat)([\a]*)(dog)", "cat\a\a\adog", RegexOptions.None, new string[] { "cat\a\a\adog", "cat", "\a\a\a", "dog" } };
+ yield return new object[] { engine, null, @"(cat)([\b]*)(dog)", "cat\b\b\bdog", RegexOptions.None, new string[] { "cat\b\b\bdog", "cat", "\b\b\b", "dog" } };
+ yield return new object[] { engine, null, @"(cat)([\e]*)(dog)", "cat\u001B\u001B\u001Bdog", RegexOptions.None, new string[] { "cat\u001B\u001B\u001Bdog", "cat", "\u001B\u001B\u001B", "dog" } };
+ yield return new object[] { engine, null, @"(cat)([\f]*)(dog)", "cat\f\f\fdog", RegexOptions.None, new string[] { "cat\f\f\fdog", "cat", "\f\f\f", "dog" } };
+ yield return new object[] { engine, null, @"(cat)([\r]*)(dog)", "cat\r\r\rdog", RegexOptions.None, new string[] { "cat\r\r\rdog", "cat", "\r\r\r", "dog" } };
+ yield return new object[] { engine, null, @"(cat)([\v]*)(dog)", "cat\v\v\vdog", RegexOptions.None, new string[] { "cat\v\v\vdog", "cat", "\v\v\v", "dog" } };
+
+ // \d, \D, \s, \S, \w, \W, \P, \p inside character range ([0-5]) with ECMA Option
+ yield return new object[] { engine, null, @"cat([\d]*)dog", "hello123cat230927dog1412d", RegexOptions.ECMAScript, new string[] { "cat230927dog", "230927" } };
+ yield return new object[] { engine, null, @"([\D]*)dog", "65498catdog58719", RegexOptions.ECMAScript, new string[] { "catdog", "cat" } };
+ yield return new object[] { engine, null, @"cat([\s]*)dog", "wiocat dog3270", RegexOptions.ECMAScript, new string[] { "cat dog", " " } };
+ yield return new object[] { engine, null, @"cat([\S]*)", "sfdcatdog 3270", RegexOptions.ECMAScript, new string[] { "catdog", "dog" } };
+ yield return new object[] { engine, null, @"cat([\w]*)", "sfdcatdog 3270", RegexOptions.ECMAScript, new string[] { "catdog", "dog" } };
+ yield return new object[] { engine, null, @"cat([\W]*)dog", "wiocat dog3270", RegexOptions.ECMAScript, new string[] { "cat dog", " " } };
+ yield return new object[] { engine, null, @"([\p{Lu}]\w*)\s([\p{Lu}]\w*)", "Hello World", RegexOptions.ECMAScript, new string[] { "Hello World", "Hello", "World" } };
+ yield return new object[] { engine, null, @"([\P{Ll}][\p{Ll}]*)\s([\P{Ll}][\p{Ll}]*)", "Hello World", RegexOptions.ECMAScript, new string[] { "Hello World", "Hello", "World" } };
+
+ // \d, \D, \s, \S, \w, \W, \P, \p outside character range ([0-5]) with ECMA Option
+ yield return new object[] { engine, null, @"(cat)\d*dog", "hello123cat230927dog1412d", RegexOptions.ECMAScript, new string[] { "cat230927dog", "cat" } };
+ yield return new object[] { engine, null, @"\D*(dog)", "65498catdog58719", RegexOptions.ECMAScript, new string[] { "catdog", "dog" } };
+ yield return new object[] { engine, null, @"(cat)\s*(dog)", "wiocat dog3270", RegexOptions.ECMAScript, new string[] { "cat dog", "cat", "dog" } };
+ yield return new object[] { engine, null, @"(cat)\S*", "sfdcatdog 3270", RegexOptions.ECMAScript, new string[] { "catdog", "cat" } };
+ yield return new object[] { engine, null, @"(cat)\w*", "sfdcatdog 3270", RegexOptions.ECMAScript, new string[] { "catdog", "cat" } };
+ yield return new object[] { engine, null, @"(cat)\W*(dog)", "wiocat dog3270", RegexOptions.ECMAScript, new string[] { "cat dog", "cat", "dog" } };
+ yield return new object[] { engine, null, @"\p{Lu}(\w*)\s\p{Lu}(\w*)", "Hello World", RegexOptions.ECMAScript, new string[] { "Hello World", "ello", "orld" } };
+ yield return new object[] { engine, null, @"\P{Ll}\p{Ll}*\s\P{Ll}\p{Ll}*", "Hello World", RegexOptions.ECMAScript, new string[] { "Hello World" } };
+
+ // Use < in a group
+ yield return new object[] { engine, null, @"cat(?<dog121>dog)", "catcatdogdogcat", RegexOptions.None, new string[] { "catdog", "dog" } };
+ yield return new object[] { engine, null, @"(?<cat>cat)\s*(?<cat>dog)", "catcat dogdogcat", RegexOptions.None, new string[] { "cat dog", "dog" } };
+ yield return new object[] { engine, null, @"(?<1>cat)\s*(?<1>dog)", "catcat dogdogcat", RegexOptions.None, new string[] { "cat dog", "dog" } };
+ yield return new object[] { engine, null, @"(?<2048>cat)\s*(?<2048>dog)", "catcat dogdogcat", RegexOptions.None, new string[] { "cat dog", "dog" } };
+ yield return new object[] { engine, null, @"(?<cat>cat)\w+(?<dog-cat>dog)", "cat_Hello_World_dog", RegexOptions.None, new string[] { "cat_Hello_World_dog", "", "_Hello_World_" } };
+ yield return new object[] { engine, null, @"(?<cat>cat)\w+(?<-cat>dog)", "cat_Hello_World_dog", RegexOptions.None, new string[] { "cat_Hello_World_dog", "" } };
+ yield return new object[] { engine, null, @"(?<cat>cat)\w+(?<cat-cat>dog)", "cat_Hello_World_dog", RegexOptions.None, new string[] { "cat_Hello_World_dog", "_Hello_World_" } };
+ yield return new object[] { engine, null, @"(?<1>cat)\w+(?<dog-1>dog)", "cat_Hello_World_dog", RegexOptions.None, new string[] { "cat_Hello_World_dog", "", "_Hello_World_" } };
+ yield return new object[] { engine, null, @"(?<cat>cat)\w+(?<2-cat>dog)", "cat_Hello_World_dog", RegexOptions.None, new string[] { "cat_Hello_World_dog", "", "_Hello_World_" } };
+ yield return new object[] { engine, null, @"(?<1>cat)\w+(?<2-1>dog)", "cat_Hello_World_dog", RegexOptions.None, new string[] { "cat_Hello_World_dog", "", "_Hello_World_" } };
+
+ // Quantifiers
+ yield return new object[] { engine, null, @"(?<cat>cat){", "STARTcat{", RegexOptions.None, new string[] { "cat{", "cat" } };
+ yield return new object[] { engine, null, @"(?<cat>cat){fdsa", "STARTcat{fdsa", RegexOptions.None, new string[] { "cat{fdsa", "cat" } };
+ yield return new object[] { engine, null, @"(?<cat>cat){1", "STARTcat{1", RegexOptions.None, new string[] { "cat{1", "cat" } };
+ yield return new object[] { engine, null, @"(?<cat>cat){1END", "STARTcat{1END", RegexOptions.None, new string[] { "cat{1END", "cat" } };
+ yield return new object[] { engine, null, @"(?<cat>cat){1,", "STARTcat{1,", RegexOptions.None, new string[] { "cat{1,", "cat" } };
+ yield return new object[] { engine, null, @"(?<cat>cat){1,END", "STARTcat{1,END", RegexOptions.None, new string[] { "cat{1,END", "cat" } };
+ yield return new object[] { engine, null, @"(?<cat>cat){1,2", "STARTcat{1,2", RegexOptions.None, new string[] { "cat{1,2", "cat" } };
+ yield return new object[] { engine, null, @"(?<cat>cat){1,2END", "STARTcat{1,2END", RegexOptions.None, new string[] { "cat{1,2END", "cat" } };
+
+ // Use IgnorePatternWhitespace
+ yield return new object[] { engine, null, @"(cat) #cat
\s+ #followed by 1 or more whitespace
(dog) #followed by dog
", "cat dog", RegexOptions.IgnorePatternWhitespace, new string[] { "cat dog", "cat", "dog" } };
- yield return new object[] { null, @"(cat) #cat
+ yield return new object[] { engine, null, @"(cat) #cat
\s+ #followed by 1 or more whitespace
(dog) #followed by dog", "cat dog", RegexOptions.IgnorePatternWhitespace, new string[] { "cat dog", "cat", "dog" } };
- yield return new object[] { null, @"(cat) (?#cat) \s+ (?#followed by 1 or more whitespace) (dog) (?#followed by dog)", "cat dog", RegexOptions.IgnorePatternWhitespace, new string[] { "cat dog", "cat", "dog" } };
-
- // Back Reference
- yield return new object[] { null, @"(?<cat>cat)(?<dog>dog)\k<cat>", "asdfcatdogcatdog", RegexOptions.None, new string[] { "catdogcat", "cat", "dog" } };
- yield return new object[] { null, @"(?<cat>cat)\s+(?<dog>dog)\k<cat>", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } };
- yield return new object[] { null, @"(?<cat>cat)\s+(?<dog>dog)\k'cat'", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } };
- yield return new object[] { null, @"(?<cat>cat)\s+(?<dog>dog)\<cat>", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } };
- yield return new object[] { null, @"(?<cat>cat)\s+(?<dog>dog)\'cat'", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } };
-
- yield return new object[] { null, @"(?<cat>cat)\s+(?<dog>dog)\k<1>", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } };
- yield return new object[] { null, @"(?<cat>cat)\s+(?<dog>dog)\k'1'", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } };
- yield return new object[] { null, @"(?<cat>cat)\s+(?<dog>dog)\<1>", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } };
- yield return new object[] { null, @"(?<cat>cat)\s+(?<dog>dog)\'1'", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } };
- yield return new object[] { null, @"(?<cat>cat)\s+(?<dog>dog)\1", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } };
- yield return new object[] { null, @"(?<cat>cat)\s+(?<dog>dog)\1", "asdfcat dogcat dog", RegexOptions.ECMAScript, new string[] { "cat dogcat", "cat", "dog" } };
-
- yield return new object[] { null, @"(?<cat>cat)\s+(?<dog>dog)\k<dog>", "asdfcat dogdog dog", RegexOptions.None, new string[] { "cat dogdog", "cat", "dog" } };
- yield return new object[] { null, @"(?<cat>cat)\s+(?<dog>dog)\2", "asdfcat dogdog dog", RegexOptions.None, new string[] { "cat dogdog", "cat", "dog" } };
- yield return new object[] { null, @"(?<cat>cat)\s+(?<dog>dog)\2", "asdfcat dogdog dog", RegexOptions.ECMAScript, new string[] { "cat dogdog", "cat", "dog" } };
-
- // Octal
- yield return new object[] { null, @"(cat)(\077)", "hellocat?dogworld", RegexOptions.None, new string[] { "cat?", "cat", "?" } };
- yield return new object[] { null, @"(cat)(\77)", "hellocat?dogworld", RegexOptions.None, new string[] { "cat?", "cat", "?" } };
- yield return new object[] { null, @"(cat)(\176)", "hellocat~dogworld", RegexOptions.None, new string[] { "cat~", "cat", "~" } };
- yield return new object[] { null, @"(cat)(\400)", "hellocat\0dogworld", RegexOptions.None, new string[] { "cat\0", "cat", "\0" } };
- yield return new object[] { null, @"(cat)(\300)", "hellocat\u00C0dogworld", RegexOptions.None, new string[] { "cat\u00C0", "cat", "\u00C0" } };
- yield return new object[] { null, @"(cat)(\477)", "hellocat\u003Fdogworld", RegexOptions.None, new string[] { "cat\u003F", "cat", "\u003F" } };
- yield return new object[] { null, @"(cat)(\777)", "hellocat\u00FFdogworld", RegexOptions.None, new string[] { "cat\u00FF", "cat", "\u00FF" } };
- yield return new object[] { null, @"(cat)(\7770)", "hellocat\u00FF0dogworld", RegexOptions.None, new string[] { "cat\u00FF0", "cat", "\u00FF0" } };
-
- yield return new object[] { null, @"(cat)(\077)", "hellocat?dogworld", RegexOptions.ECMAScript, new string[] { "cat?", "cat", "?" } };
- yield return new object[] { null, @"(cat)(\77)", "hellocat?dogworld", RegexOptions.ECMAScript, new string[] { "cat?", "cat", "?" } };
- yield return new object[] { null, @"(cat)(\7)", "hellocat\adogworld", RegexOptions.ECMAScript, new string[] { "cat\a", "cat", "\a" } };
- yield return new object[] { null, @"(cat)(\40)", "hellocat dogworld", RegexOptions.ECMAScript, new string[] { "cat ", "cat", " " } };
- yield return new object[] { null, @"(cat)(\040)", "hellocat dogworld", RegexOptions.ECMAScript, new string[] { "cat ", "cat", " " } };
- yield return new object[] { null, @"(cat)(\176)", "hellocatcat76dogworld", RegexOptions.ECMAScript, new string[] { "catcat76", "cat", "cat76" } };
- yield return new object[] { null, @"(cat)(\377)", "hellocat\u00FFdogworld", RegexOptions.ECMAScript, new string[] { "cat\u00FF", "cat", "\u00FF" } };
- yield return new object[] { null, @"(cat)(\400)", "hellocat 0Fdogworld", RegexOptions.ECMAScript, new string[] { "cat 0", "cat", " 0" } };
-
- // Decimal
- yield return new object[] { null, @"(cat)\s+(?<2147483646>dog)", "asdlkcat dogiwod", RegexOptions.None, new string[] { "cat dog", "cat", "dog" } };
- yield return new object[] { null, @"(cat)\s+(?<2147483647>dog)", "asdlkcat dogiwod", RegexOptions.None, new string[] { "cat dog", "cat", "dog" } };
-
- // Hex
- yield return new object[] { null, @"(cat)(\x2a*)(dog)", "asdlkcat***dogiwod", RegexOptions.None, new string[] { "cat***dog", "cat", "***", "dog" } };
- yield return new object[] { null, @"(cat)(\x2b*)(dog)", "asdlkcat+++dogiwod", RegexOptions.None, new string[] { "cat+++dog", "cat", "+++", "dog" } };
- yield return new object[] { null, @"(cat)(\x2c*)(dog)", "asdlkcat,,,dogiwod", RegexOptions.None, new string[] { "cat,,,dog", "cat", ",,,", "dog" } };
- yield return new object[] { null, @"(cat)(\x2d*)(dog)", "asdlkcat---dogiwod", RegexOptions.None, new string[] { "cat---dog", "cat", "---", "dog" } };
- yield return new object[] { null, @"(cat)(\x2e*)(dog)", "asdlkcat...dogiwod", RegexOptions.None, new string[] { "cat...dog", "cat", "...", "dog" } };
- yield return new object[] { null, @"(cat)(\x2f*)(dog)", "asdlkcat///dogiwod", RegexOptions.None, new string[] { "cat///dog", "cat", "///", "dog" } };
-
- yield return new object[] { null, @"(cat)(\x2A*)(dog)", "asdlkcat***dogiwod", RegexOptions.None, new string[] { "cat***dog", "cat", "***", "dog" } };
- yield return new object[] { null, @"(cat)(\x2B*)(dog)", "asdlkcat+++dogiwod", RegexOptions.None, new string[] { "cat+++dog", "cat", "+++", "dog" } };
- yield return new object[] { null, @"(cat)(\x2C*)(dog)", "asdlkcat,,,dogiwod", RegexOptions.None, new string[] { "cat,,,dog", "cat", ",,,", "dog" } };
- yield return new object[] { null, @"(cat)(\x2D*)(dog)", "asdlkcat---dogiwod", RegexOptions.None, new string[] { "cat---dog", "cat", "---", "dog" } };
- yield return new object[] { null, @"(cat)(\x2E*)(dog)", "asdlkcat...dogiwod", RegexOptions.None, new string[] { "cat...dog", "cat", "...", "dog" } };
- yield return new object[] { null, @"(cat)(\x2F*)(dog)", "asdlkcat///dogiwod", RegexOptions.None, new string[] { "cat///dog", "cat", "///", "dog" } };
-
- // ScanControl
- yield return new object[] { null, @"(cat)(\c@*)(dog)", "asdlkcat\0\0dogiwod", RegexOptions.None, new string[] { "cat\0\0dog", "cat", "\0\0", "dog" } };
- yield return new object[] { null, @"(cat)(\cA*)(dog)", "asdlkcat\u0001dogiwod", RegexOptions.None, new string[] { "cat\u0001dog", "cat", "\u0001", "dog" } };
- yield return new object[] { null, @"(cat)(\ca*)(dog)", "asdlkcat\u0001dogiwod", RegexOptions.None, new string[] { "cat\u0001dog", "cat", "\u0001", "dog" } };
-
- yield return new object[] { null, @"(cat)(\cC*)(dog)", "asdlkcat\u0003dogiwod", RegexOptions.None, new string[] { "cat\u0003dog", "cat", "\u0003", "dog" } };
- yield return new object[] { null, @"(cat)(\cc*)(dog)", "asdlkcat\u0003dogiwod", RegexOptions.None, new string[] { "cat\u0003dog", "cat", "\u0003", "dog" } };
-
- yield return new object[] { null, @"(cat)(\cD*)(dog)", "asdlkcat\u0004dogiwod", RegexOptions.None, new string[] { "cat\u0004dog", "cat", "\u0004", "dog" } };
- yield return new object[] { null, @"(cat)(\cd*)(dog)", "asdlkcat\u0004dogiwod", RegexOptions.None, new string[] { "cat\u0004dog", "cat", "\u0004", "dog" } };
-
- yield return new object[] { null, @"(cat)(\cX*)(dog)", "asdlkcat\u0018dogiwod", RegexOptions.None, new string[] { "cat\u0018dog", "cat", "\u0018", "dog" } };
- yield return new object[] { null, @"(cat)(\cx*)(dog)", "asdlkcat\u0018dogiwod", RegexOptions.None, new string[] { "cat\u0018dog", "cat", "\u0018", "dog" } };
-
- yield return new object[] { null, @"(cat)(\cZ*)(dog)", "asdlkcat\u001adogiwod", RegexOptions.None, new string[] { "cat\u001adog", "cat", "\u001a", "dog" } };
- yield return new object[] { null, @"(cat)(\cz*)(dog)", "asdlkcat\u001adogiwod", RegexOptions.None, new string[] { "cat\u001adog", "cat", "\u001a", "dog" } };
-
- if (!PlatformDetection.IsNetFramework) // missing fix for https://github.com/dotnet/runtime/issues/24759
- {
- yield return new object[] { null, @"(cat)(\c[*)(dog)", "asdlkcat\u001bdogiwod", RegexOptions.None, new string[] { "cat\u001bdog", "cat", "\u001b", "dog" } };
- }
+ yield return new object[] { engine, null, @"(cat) (?#cat) \s+ (?#followed by 1 or more whitespace) (dog) (?#followed by dog)", "cat dog", RegexOptions.IgnorePatternWhitespace, new string[] { "cat dog", "cat", "dog" } };
+
+ // Back Reference
+ yield return new object[] { engine, null, @"(?<cat>cat)(?<dog>dog)\k<cat>", "asdfcatdogcatdog", RegexOptions.None, new string[] { "catdogcat", "cat", "dog" } };
+ yield return new object[] { engine, null, @"(?<cat>cat)\s+(?<dog>dog)\k<cat>", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } };
+ yield return new object[] { engine, null, @"(?<cat>cat)\s+(?<dog>dog)\k'cat'", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } };
+ yield return new object[] { engine, null, @"(?<cat>cat)\s+(?<dog>dog)\<cat>", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } };
+ yield return new object[] { engine, null, @"(?<cat>cat)\s+(?<dog>dog)\'cat'", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } };
+
+ yield return new object[] { engine, null, @"(?<cat>cat)\s+(?<dog>dog)\k<1>", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } };
+ yield return new object[] { engine, null, @"(?<cat>cat)\s+(?<dog>dog)\k'1'", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } };
+ yield return new object[] { engine, null, @"(?<cat>cat)\s+(?<dog>dog)\<1>", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } };
+ yield return new object[] { engine, null, @"(?<cat>cat)\s+(?<dog>dog)\'1'", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } };
+ yield return new object[] { engine, null, @"(?<cat>cat)\s+(?<dog>dog)\1", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } };
+ yield return new object[] { engine, null, @"(?<cat>cat)\s+(?<dog>dog)\1", "asdfcat dogcat dog", RegexOptions.ECMAScript, new string[] { "cat dogcat", "cat", "dog" } };
+
+ yield return new object[] { engine, null, @"(?<cat>cat)\s+(?<dog>dog)\k<dog>", "asdfcat dogdog dog", RegexOptions.None, new string[] { "cat dogdog", "cat", "dog" } };
+ yield return new object[] { engine, null, @"(?<cat>cat)\s+(?<dog>dog)\2", "asdfcat dogdog dog", RegexOptions.None, new string[] { "cat dogdog", "cat", "dog" } };
+ yield return new object[] { engine, null, @"(?<cat>cat)\s+(?<dog>dog)\2", "asdfcat dogdog dog", RegexOptions.ECMAScript, new string[] { "cat dogdog", "cat", "dog" } };
+
+ // Octal
+ yield return new object[] { engine, null, @"(cat)(\077)", "hellocat?dogworld", RegexOptions.None, new string[] { "cat?", "cat", "?" } };
+ yield return new object[] { engine, null, @"(cat)(\77)", "hellocat?dogworld", RegexOptions.None, new string[] { "cat?", "cat", "?" } };
+ yield return new object[] { engine, null, @"(cat)(\176)", "hellocat~dogworld", RegexOptions.None, new string[] { "cat~", "cat", "~" } };
+ yield return new object[] { engine, null, @"(cat)(\400)", "hellocat\0dogworld", RegexOptions.None, new string[] { "cat\0", "cat", "\0" } };
+ yield return new object[] { engine, null, @"(cat)(\300)", "hellocat\u00C0dogworld", RegexOptions.None, new string[] { "cat\u00C0", "cat", "\u00C0" } };
+ yield return new object[] { engine, null, @"(cat)(\477)", "hellocat\u003Fdogworld", RegexOptions.None, new string[] { "cat\u003F", "cat", "\u003F" } };
+ yield return new object[] { engine, null, @"(cat)(\777)", "hellocat\u00FFdogworld", RegexOptions.None, new string[] { "cat\u00FF", "cat", "\u00FF" } };
+ yield return new object[] { engine, null, @"(cat)(\7770)", "hellocat\u00FF0dogworld", RegexOptions.None, new string[] { "cat\u00FF0", "cat", "\u00FF0" } };
+
+ yield return new object[] { engine, null, @"(cat)(\077)", "hellocat?dogworld", RegexOptions.ECMAScript, new string[] { "cat?", "cat", "?" } };
+ yield return new object[] { engine, null, @"(cat)(\77)", "hellocat?dogworld", RegexOptions.ECMAScript, new string[] { "cat?", "cat", "?" } };
+ yield return new object[] { engine, null, @"(cat)(\7)", "hellocat\adogworld", RegexOptions.ECMAScript, new string[] { "cat\a", "cat", "\a" } };
+ yield return new object[] { engine, null, @"(cat)(\40)", "hellocat dogworld", RegexOptions.ECMAScript, new string[] { "cat ", "cat", " " } };
+ yield return new object[] { engine, null, @"(cat)(\040)", "hellocat dogworld", RegexOptions.ECMAScript, new string[] { "cat ", "cat", " " } };
+ yield return new object[] { engine, null, @"(cat)(\176)", "hellocatcat76dogworld", RegexOptions.ECMAScript, new string[] { "catcat76", "cat", "cat76" } };
+ yield return new object[] { engine, null, @"(cat)(\377)", "hellocat\u00FFdogworld", RegexOptions.ECMAScript, new string[] { "cat\u00FF", "cat", "\u00FF" } };
+ yield return new object[] { engine, null, @"(cat)(\400)", "hellocat 0Fdogworld", RegexOptions.ECMAScript, new string[] { "cat 0", "cat", " 0" } };
+
+ // Decimal
+ yield return new object[] { engine, null, @"(cat)\s+(?<2147483646>dog)", "asdlkcat dogiwod", RegexOptions.None, new string[] { "cat dog", "cat", "dog" } };
+ yield return new object[] { engine, null, @"(cat)\s+(?<2147483647>dog)", "asdlkcat dogiwod", RegexOptions.None, new string[] { "cat dog", "cat", "dog" } };
+
+ // Hex
+ yield return new object[] { engine, null, @"(cat)(\x2a*)(dog)", "asdlkcat***dogiwod", RegexOptions.None, new string[] { "cat***dog", "cat", "***", "dog" } };
+ yield return new object[] { engine, null, @"(cat)(\x2b*)(dog)", "asdlkcat+++dogiwod", RegexOptions.None, new string[] { "cat+++dog", "cat", "+++", "dog" } };
+ yield return new object[] { engine, null, @"(cat)(\x2c*)(dog)", "asdlkcat,,,dogiwod", RegexOptions.None, new string[] { "cat,,,dog", "cat", ",,,", "dog" } };
+ yield return new object[] { engine, null, @"(cat)(\x2d*)(dog)", "asdlkcat---dogiwod", RegexOptions.None, new string[] { "cat---dog", "cat", "---", "dog" } };
+ yield return new object[] { engine, null, @"(cat)(\x2e*)(dog)", "asdlkcat...dogiwod", RegexOptions.None, new string[] { "cat...dog", "cat", "...", "dog" } };
+ yield return new object[] { engine, null, @"(cat)(\x2f*)(dog)", "asdlkcat///dogiwod", RegexOptions.None, new string[] { "cat///dog", "cat", "///", "dog" } };
+
+ yield return new object[] { engine, null, @"(cat)(\x2A*)(dog)", "asdlkcat***dogiwod", RegexOptions.None, new string[] { "cat***dog", "cat", "***", "dog" } };
+ yield return new object[] { engine, null, @"(cat)(\x2B*)(dog)", "asdlkcat+++dogiwod", RegexOptions.None, new string[] { "cat+++dog", "cat", "+++", "dog" } };
+ yield return new object[] { engine, null, @"(cat)(\x2C*)(dog)", "asdlkcat,,,dogiwod", RegexOptions.None, new string[] { "cat,,,dog", "cat", ",,,", "dog" } };
+ yield return new object[] { engine, null, @"(cat)(\x2D*)(dog)", "asdlkcat---dogiwod", RegexOptions.None, new string[] { "cat---dog", "cat", "---", "dog" } };
+ yield return new object[] { engine, null, @"(cat)(\x2E*)(dog)", "asdlkcat...dogiwod", RegexOptions.None, new string[] { "cat...dog", "cat", "...", "dog" } };
+ yield return new object[] { engine, null, @"(cat)(\x2F*)(dog)", "asdlkcat///dogiwod", RegexOptions.None, new string[] { "cat///dog", "cat", "///", "dog" } };
+
+ // ScanControl
+ yield return new object[] { engine, null, @"(cat)(\c@*)(dog)", "asdlkcat\0\0dogiwod", RegexOptions.None, new string[] { "cat\0\0dog", "cat", "\0\0", "dog" } };
+ yield return new object[] { engine, null, @"(cat)(\cA*)(dog)", "asdlkcat\u0001dogiwod", RegexOptions.None, new string[] { "cat\u0001dog", "cat", "\u0001", "dog" } };
+ yield return new object[] { engine, null, @"(cat)(\ca*)(dog)", "asdlkcat\u0001dogiwod", RegexOptions.None, new string[] { "cat\u0001dog", "cat", "\u0001", "dog" } };
+
+ yield return new object[] { engine, null, @"(cat)(\cC*)(dog)", "asdlkcat\u0003dogiwod", RegexOptions.None, new string[] { "cat\u0003dog", "cat", "\u0003", "dog" } };
+ yield return new object[] { engine, null, @"(cat)(\cc*)(dog)", "asdlkcat\u0003dogiwod", RegexOptions.None, new string[] { "cat\u0003dog", "cat", "\u0003", "dog" } };
+
+ yield return new object[] { engine, null, @"(cat)(\cD*)(dog)", "asdlkcat\u0004dogiwod", RegexOptions.None, new string[] { "cat\u0004dog", "cat", "\u0004", "dog" } };
+ yield return new object[] { engine, null, @"(cat)(\cd*)(dog)", "asdlkcat\u0004dogiwod", RegexOptions.None, new string[] { "cat\u0004dog", "cat", "\u0004", "dog" } };
+
+ yield return new object[] { engine, null, @"(cat)(\cX*)(dog)", "asdlkcat\u0018dogiwod", RegexOptions.None, new string[] { "cat\u0018dog", "cat", "\u0018", "dog" } };
+ yield return new object[] { engine, null, @"(cat)(\cx*)(dog)", "asdlkcat\u0018dogiwod", RegexOptions.None, new string[] { "cat\u0018dog", "cat", "\u0018", "dog" } };
+
+ yield return new object[] { engine, null, @"(cat)(\cZ*)(dog)", "asdlkcat\u001adogiwod", RegexOptions.None, new string[] { "cat\u001adog", "cat", "\u001a", "dog" } };
+ yield return new object[] { engine, null, @"(cat)(\cz*)(dog)", "asdlkcat\u001adogiwod", RegexOptions.None, new string[] { "cat\u001adog", "cat", "\u001a", "dog" } };
+
+ if (!PlatformDetection.IsNetFramework) // missing fix for https://github.com/dotnet/runtime/issues/24759
+ {
+ yield return new object[] { engine, null, @"(cat)(\c[*)(dog)", "asdlkcat\u001bdogiwod", RegexOptions.None, new string[] { "cat\u001bdog", "cat", "\u001b", "dog" } };
+ }
- // Atomic Zero-Width Assertions \A \G ^ \Z \z \b \B
- //\A
- yield return new object[] { null, @"\Acat\s+dog", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog" } };
- yield return new object[] { null, @"\Acat\s+dog", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog" } };
- yield return new object[] { null, @"\A(cat)\s+(dog)", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog", "cat", "dog" } };
- yield return new object[] { null, @"\A(cat)\s+(dog)", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog", "cat", "dog" } };
-
- //\G
- yield return new object[] { null, @"\Gcat\s+dog", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog" } };
- yield return new object[] { null, @"\Gcat\s+dog", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog" } };
- yield return new object[] { null, @"\Gcat\s+dog", "cat \n\n\n dog", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog" } };
- yield return new object[] { null, @"\G(cat)\s+(dog)", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog", "cat", "dog" } };
- yield return new object[] { null, @"\G(cat)\s+(dog)", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog", "cat", "dog" } };
- yield return new object[] { null, @"\G(cat)\s+(dog)", "cat \n\n\n dog", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog", "cat", "dog" } };
-
- //^
- yield return new object[] { null, @"^cat\s+dog", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog" } };
- yield return new object[] { null, @"^cat\s+dog", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog" } };
- yield return new object[] { null, @"mouse\s\n^cat\s+dog", "mouse\n\ncat \n\n\n dog", RegexOptions.Multiline, new string[] { "mouse\n\ncat \n\n\n dog" } };
- yield return new object[] { null, @"^cat\s+dog", "cat \n\n\n dog", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog" } };
- yield return new object[] { null, @"^(cat)\s+(dog)", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog", "cat", "dog" } };
- yield return new object[] { null, @"^(cat)\s+(dog)", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog", "cat", "dog" } };
- yield return new object[] { null, @"(mouse)\s\n^(cat)\s+(dog)", "mouse\n\ncat \n\n\n dog", RegexOptions.Multiline, new string[] { "mouse\n\ncat \n\n\n dog", "mouse", "cat", "dog" } };
- yield return new object[] { null, @"^(cat)\s+(dog)", "cat \n\n\n dog", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog", "cat", "dog" } };
-
- //\Z
- yield return new object[] { null, @"cat\s+dog\Z", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog" } };
- yield return new object[] { null, @"cat\s+dog\Z", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog" } };
- yield return new object[] { null, @"cat\s+dog\Z", "cat \n\n\n dog", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog" } };
- yield return new object[] { null, @"cat\s+dog\Z", "cat \n\n\n dog\n", RegexOptions.None, new string[] { "cat \n\n\n dog" } };
- yield return new object[] { null, @"cat\s+dog\Z", "cat \n\n\n dog\n", RegexOptions.Multiline, new string[] { "cat \n\n\n dog" } };
- yield return new object[] { null, @"cat\s+dog\Z", "cat \n\n\n dog\n", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog" } };
- yield return new object[] { null, @"(cat)\s+(dog)\Z", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog", "cat", "dog" } };
- yield return new object[] { null, @"(cat)\s+(dog)\Z", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog", "cat", "dog" } };
- yield return new object[] { null, @"(cat)\s+(dog)\Z", "cat \n\n\n dog", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog", "cat", "dog" } };
- yield return new object[] { null, @"(cat)\s+(dog)\Z", "cat \n\n\n dog\n", RegexOptions.None, new string[] { "cat \n\n\n dog", "cat", "dog" } };
- yield return new object[] { null, @"(cat)\s+(dog)\Z", "cat \n\n\n dog\n", RegexOptions.Multiline, new string[] { "cat \n\n\n dog", "cat", "dog" } };
- yield return new object[] { null, @"(cat)\s+(dog)\Z", "cat \n\n\n dog\n", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog", "cat", "dog" } };
-
- //\z
- yield return new object[] { null, @"cat\s+dog\z", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog" } };
- yield return new object[] { null, @"cat\s+dog\z", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog" } };
- yield return new object[] { null, @"cat\s+dog\z", "cat \n\n\n dog", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog" } };
- yield return new object[] { null, @"(cat)\s+(dog)\z", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog", "cat", "dog" } };
- yield return new object[] { null, @"(cat)\s+(dog)\z", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog", "cat", "dog" } };
- yield return new object[] { null, @"(cat)\s+(dog)\z", "cat \n\n\n dog", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog", "cat", "dog" } };
-
- //\b
- yield return new object[] { null, @"\bcat\b", "cat", RegexOptions.None, new string[] { "cat" } };
- yield return new object[] { null, @"\bcat\b", "dog cat mouse", RegexOptions.None, new string[] { "cat" } };
- yield return new object[] { null, @"\bcat\b", "cat", RegexOptions.ECMAScript, new string[] { "cat" } };
- yield return new object[] { null, @"\bcat\b", "dog cat mouse", RegexOptions.ECMAScript, new string[] { "cat" } };
- yield return new object[] { null, @".*\bcat\b", "cat", RegexOptions.None, new string[] { "cat" } };
- yield return new object[] { null, @".*\bcat\b", "dog cat mouse", RegexOptions.None, new string[] { "dog cat" } };
- yield return new object[] { null, @".*\bcat\b", "cat", RegexOptions.ECMAScript, new string[] { "cat" } };
- yield return new object[] { null, @".*\bcat\b", "dog cat mouse", RegexOptions.ECMAScript, new string[] { "dog cat" } };
- yield return new object[] { null, @"\b@cat", "123START123@catEND", RegexOptions.None, new string[] { "@cat" } };
- yield return new object[] { null, @"\b\<cat", "123START123<catEND", RegexOptions.None, new string[] { "<cat" } };
- yield return new object[] { null, @"\b,cat", "satwe,,,START,catEND", RegexOptions.None, new string[] { ",cat" } };
- yield return new object[] { null, @"\b\[cat", "`12START123[catEND", RegexOptions.None, new string[] { "[cat" } };
-
- //\B
- yield return new object[] { null, @"\Bcat\B", "dogcatmouse", RegexOptions.None, new string[] { "cat" } };
- yield return new object[] { null, @"dog\Bcat\B", "dogcatmouse", RegexOptions.None, new string[] { "dogcat" } };
- yield return new object[] { null, @".*\Bcat\B", "dogcatmouse", RegexOptions.None, new string[] { "dogcat" } };
- yield return new object[] { null, @"\Bcat\B", "dogcatmouse", RegexOptions.ECMAScript, new string[] { "cat" } };
- yield return new object[] { null, @"dog\Bcat\B", "dogcatmouse", RegexOptions.ECMAScript, new string[] { "dogcat" } };
- yield return new object[] { null, @".*\Bcat\B", "dogcatmouse", RegexOptions.ECMAScript, new string[] { "dogcat" } };
- yield return new object[] { null, @"\B@cat", "123START123;@catEND", RegexOptions.None, new string[] { "@cat" } };
- yield return new object[] { null, @"\B\<cat", "123START123'<catEND", RegexOptions.None, new string[] { "<cat" } };
- yield return new object[] { null, @"\B,cat", "satwe,,,START',catEND", RegexOptions.None, new string[] { ",cat" } };
- yield return new object[] { null, @"\B\[cat", "`12START123'[catEND", RegexOptions.None, new string[] { "[cat" } };
-
- // \w matching \p{Lm} (Letter, Modifier)
- yield return new object[] { null, @"\w+\s+\w+", "cat\u02b0 dog\u02b1", RegexOptions.None, new string[] { "cat\u02b0 dog\u02b1" } };
- yield return new object[] { null, @"cat\w+\s+dog\w+", "STARTcat\u30FC dog\u3005END", RegexOptions.None, new string[] { "cat\u30FC dog\u3005END" } };
- yield return new object[] { null, @"cat\w+\s+dog\w+", "STARTcat\uff9e dog\uff9fEND", RegexOptions.None, new string[] { "cat\uff9e dog\uff9fEND" } };
- yield return new object[] { null, @"(\w+)\s+(\w+)", "cat\u02b0 dog\u02b1", RegexOptions.None, new string[] { "cat\u02b0 dog\u02b1", "cat\u02b0", "dog\u02b1" } };
- yield return new object[] { null, @"(cat\w+)\s+(dog\w+)", "STARTcat\u30FC dog\u3005END", RegexOptions.None, new string[] { "cat\u30FC dog\u3005END", "cat\u30FC", "dog\u3005END" } };
- yield return new object[] { null, @"(cat\w+)\s+(dog\w+)", "STARTcat\uff9e dog\uff9fEND", RegexOptions.None, new string[] { "cat\uff9e dog\uff9fEND", "cat\uff9e", "dog\uff9fEND" } };
-
- // Positive and negative character classes [a-c]|[^b-c]
- yield return new object[] { null, @"[^a]|d", "d", RegexOptions.None, new string[] { "d" } };
- yield return new object[] { null, @"([^a]|[d])*", "Hello Worlddf", RegexOptions.None, new string[] { "Hello Worlddf", "f" } };
- yield return new object[] { null, @"([^{}]|\n)+", "{{{{Hello\n World \n}END", RegexOptions.None, new string[] { "Hello\n World \n", "\n" } };
- yield return new object[] { null, @"([a-d]|[^abcd])+", "\tonce\n upon\0 a- ()*&^%#time?", RegexOptions.None, new string[] { "\tonce\n upon\0 a- ()*&^%#time?", "?" } };
- yield return new object[] { null, @"([^a]|[a])*", "once upon a time", RegexOptions.None, new string[] { "once upon a time", "e" } };
- yield return new object[] { null, @"([a-d]|[^abcd]|[x-z]|^wxyz])+", "\tonce\n upon\0 a- ()*&^%#time?", RegexOptions.None, new string[] { "\tonce\n upon\0 a- ()*&^%#time?", "?" } };
- yield return new object[] { null, @"([a-d]|[e-i]|[^e]|wxyz])+", "\tonce\n upon\0 a- ()*&^%#time?", RegexOptions.None, new string[] { "\tonce\n upon\0 a- ()*&^%#time?", "?" } };
-
- // Canonical and noncanonical char class, where one group is in it's
- // simplest form [a-e] and another is more complex.
- yield return new object[] { null, @"^(([^b]+ )|(.* ))$", "aaa ", RegexOptions.None, new string[] { "aaa ", "aaa ", "aaa ", "" } };
- yield return new object[] { null, @"^(([^b]+ )|(.*))$", "aaa", RegexOptions.None, new string[] { "aaa", "aaa", "", "aaa" } };
- yield return new object[] { null, @"^(([^b]+ )|(.* ))$", "bbb ", RegexOptions.None, new string[] { "bbb ", "bbb ", "", "bbb " } };
- yield return new object[] { null, @"^(([^b]+ )|(.*))$", "bbb", RegexOptions.None, new string[] { "bbb", "bbb", "", "bbb" } };
- yield return new object[] { null, @"^((a*)|(.*))$", "aaa", RegexOptions.None, new string[] { "aaa", "aaa", "aaa", "" } };
- yield return new object[] { null, @"^((a*)|(.*))$", "aaabbb", RegexOptions.None, new string[] { "aaabbb", "aaabbb", "", "aaabbb" } };
-
- yield return new object[] { null, @"(([0-9])|([a-z])|([A-Z]))*", "{hello 1234567890 world}", RegexOptions.None, new string[] { "", "", "", "", "" } };
- yield return new object[] { null, @"(([0-9])|([a-z])|([A-Z]))+", "{hello 1234567890 world}", RegexOptions.None, new string[] { "hello", "o", "", "o", "" } };
- yield return new object[] { null, @"(([0-9])|([a-z])|([A-Z]))*", "{HELLO 1234567890 world}", RegexOptions.None, new string[] { "", "", "", "", "" } };
- yield return new object[] { null, @"(([0-9])|([a-z])|([A-Z]))+", "{HELLO 1234567890 world}", RegexOptions.None, new string[] { "HELLO", "O", "", "", "O" } };
- yield return new object[] { null, @"(([0-9])|([a-z])|([A-Z]))*", "{1234567890 hello world}", RegexOptions.None, new string[] { "", "", "", "", "" } };
- yield return new object[] { null, @"(([0-9])|([a-z])|([A-Z]))+", "{1234567890 hello world}", RegexOptions.None, new string[] { "1234567890", "0", "0", "", "" } };
-
- yield return new object[] { null, @"^(([a-d]*)|([a-z]*))$", "aaabbbcccdddeeefff", RegexOptions.None, new string[] { "aaabbbcccdddeeefff", "aaabbbcccdddeeefff", "", "aaabbbcccdddeeefff" } };
- yield return new object[] { null, @"^(([d-f]*)|([c-e]*))$", "dddeeeccceee", RegexOptions.None, new string[] { "dddeeeccceee", "dddeeeccceee", "", "dddeeeccceee" } };
- yield return new object[] { null, @"^(([c-e]*)|([d-f]*))$", "dddeeeccceee", RegexOptions.None, new string[] { "dddeeeccceee", "dddeeeccceee", "dddeeeccceee", "" } };
-
- // Different match in NonBackTracking when order of alternations does not matter
- yield return new object[] { null, @"(([a-d]*)|([a-z]*))", "aaabbbcccdddeeefff", RegexOptions.None, new string[] { "aaabbbcccddd", "aaabbbcccddd", "aaabbbcccddd", "" }, "aaabbbcccdddeeefff" }; // <-- Nonbacktracking match same as for "(([a-z]*)|([a-d]*))"
- yield return new object[] { null, @"(([d-f]*)|([c-e]*))", "dddeeeccceee", RegexOptions.None, new string[] { "dddeee", "dddeee", "dddeee", "" }, "dddeeeccceee" }; // <-- Nonbacktracking match same as for "(([c-e]*)|([d-f]*))"
- yield return new object[] { null, @"(([c-e]*)|([d-f]*))", "dddeeeccceee", RegexOptions.None, new string[] { "dddeeeccceee", "dddeeeccceee", "dddeeeccceee", "" } };
-
- yield return new object[] { null, @"(([a-d]*)|(.*))", "aaabbbcccdddeeefff", RegexOptions.None, new string[] { "aaabbbcccddd", "aaabbbcccddd", "aaabbbcccddd", "" }, "aaabbbcccdddeeefff" }; // <-- Nonbacktracking match same as for ".*"
- yield return new object[] { null, @"(([d-f]*)|(.*))", "dddeeeccceee", RegexOptions.None, new string[] { "dddeee", "dddeee", "dddeee", "" }, "dddeeeccceee" }; // <-- Nonbacktracking match same as for ".*"
- yield return new object[] { null, @"(([c-e]*)|(.*))", "dddeeeccceee", RegexOptions.None, new string[] { "dddeeeccceee", "dddeeeccceee", "dddeeeccceee", "" } };
-
- // \p{Pi} (Punctuation Initial quote) \p{Pf} (Punctuation Final quote)
- yield return new object[] { null, @"\p{Pi}(\w*)\p{Pf}", "\u00ABCat\u00BB \u00BBDog\u00AB'", RegexOptions.None, new string[] { "\u00ABCat\u00BB", "Cat" } };
- yield return new object[] { null, @"\p{Pi}(\w*)\p{Pf}", "\u2018Cat\u2019 \u2019Dog\u2018'", RegexOptions.None, new string[] { "\u2018Cat\u2019", "Cat" } };
-
- // ECMAScript
- yield return new object[] { null, @"(?<cat>cat)\s+(?<dog>dog)\s+\123\s+\234", "asdfcat dog cat23 dog34eia", RegexOptions.ECMAScript, new string[] { "cat dog cat23 dog34", "cat", "dog" } };
-
- // Balanced Matching
- yield return new object[] { null, @"<div>
+ // Atomic Zero-Width Assertions \A \G ^ \Z \z \b \B
+ //\A
+ yield return new object[] { engine, null, @"\Acat\s+dog", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog" } };
+ yield return new object[] { engine, null, @"\Acat\s+dog", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog" } };
+ yield return new object[] { engine, null, @"\A(cat)\s+(dog)", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog", "cat", "dog" } };
+ yield return new object[] { engine, null, @"\A(cat)\s+(dog)", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog", "cat", "dog" } };
+
+ //\G
+ yield return new object[] { engine, null, @"\Gcat\s+dog", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog" } };
+ yield return new object[] { engine, null, @"\Gcat\s+dog", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog" } };
+ yield return new object[] { engine, null, @"\Gcat\s+dog", "cat \n\n\n dog", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog" } };
+ yield return new object[] { engine, null, @"\G(cat)\s+(dog)", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog", "cat", "dog" } };
+ yield return new object[] { engine, null, @"\G(cat)\s+(dog)", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog", "cat", "dog" } };
+ yield return new object[] { engine, null, @"\G(cat)\s+(dog)", "cat \n\n\n dog", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog", "cat", "dog" } };
+
+ //^
+ yield return new object[] { engine, null, @"^cat\s+dog", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog" } };
+ yield return new object[] { engine, null, @"^cat\s+dog", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog" } };
+ yield return new object[] { engine, null, @"mouse\s\n^cat\s+dog", "mouse\n\ncat \n\n\n dog", RegexOptions.Multiline, new string[] { "mouse\n\ncat \n\n\n dog" } };
+ yield return new object[] { engine, null, @"^cat\s+dog", "cat \n\n\n dog", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog" } };
+ yield return new object[] { engine, null, @"^(cat)\s+(dog)", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog", "cat", "dog" } };
+ yield return new object[] { engine, null, @"^(cat)\s+(dog)", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog", "cat", "dog" } };
+ yield return new object[] { engine, null, @"(mouse)\s\n^(cat)\s+(dog)", "mouse\n\ncat \n\n\n dog", RegexOptions.Multiline, new string[] { "mouse\n\ncat \n\n\n dog", "mouse", "cat", "dog" } };
+ yield return new object[] { engine, null, @"^(cat)\s+(dog)", "cat \n\n\n dog", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog", "cat", "dog" } };
+
+ //\Z
+ yield return new object[] { engine, null, @"cat\s+dog\Z", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog" } };
+ yield return new object[] { engine, null, @"cat\s+dog\Z", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog" } };
+ yield return new object[] { engine, null, @"cat\s+dog\Z", "cat \n\n\n dog", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog" } };
+ yield return new object[] { engine, null, @"cat\s+dog\Z", "cat \n\n\n dog\n", RegexOptions.None, new string[] { "cat \n\n\n dog" } };
+ yield return new object[] { engine, null, @"cat\s+dog\Z", "cat \n\n\n dog\n", RegexOptions.Multiline, new string[] { "cat \n\n\n dog" } };
+ yield return new object[] { engine, null, @"cat\s+dog\Z", "cat \n\n\n dog\n", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog" } };
+ yield return new object[] { engine, null, @"(cat)\s+(dog)\Z", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog", "cat", "dog" } };
+ yield return new object[] { engine, null, @"(cat)\s+(dog)\Z", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog", "cat", "dog" } };
+ yield return new object[] { engine, null, @"(cat)\s+(dog)\Z", "cat \n\n\n dog", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog", "cat", "dog" } };
+ yield return new object[] { engine, null, @"(cat)\s+(dog)\Z", "cat \n\n\n dog\n", RegexOptions.None, new string[] { "cat \n\n\n dog", "cat", "dog" } };
+ yield return new object[] { engine, null, @"(cat)\s+(dog)\Z", "cat \n\n\n dog\n", RegexOptions.Multiline, new string[] { "cat \n\n\n dog", "cat", "dog" } };
+ yield return new object[] { engine, null, @"(cat)\s+(dog)\Z", "cat \n\n\n dog\n", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog", "cat", "dog" } };
+
+ //\z
+ yield return new object[] { engine, null, @"cat\s+dog\z", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog" } };
+ yield return new object[] { engine, null, @"cat\s+dog\z", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog" } };
+ yield return new object[] { engine, null, @"cat\s+dog\z", "cat \n\n\n dog", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog" } };
+ yield return new object[] { engine, null, @"(cat)\s+(dog)\z", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog", "cat", "dog" } };
+ yield return new object[] { engine, null, @"(cat)\s+(dog)\z", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog", "cat", "dog" } };
+ yield return new object[] { engine, null, @"(cat)\s+(dog)\z", "cat \n\n\n dog", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog", "cat", "dog" } };
+
+ //\b
+ yield return new object[] { engine, null, @"\bcat\b", "cat", RegexOptions.None, new string[] { "cat" } };
+ yield return new object[] { engine, null, @"\bcat\b", "dog cat mouse", RegexOptions.None, new string[] { "cat" } };
+ yield return new object[] { engine, null, @"\bcat\b", "cat", RegexOptions.ECMAScript, new string[] { "cat" } };
+ yield return new object[] { engine, null, @"\bcat\b", "dog cat mouse", RegexOptions.ECMAScript, new string[] { "cat" } };
+ yield return new object[] { engine, null, @".*\bcat\b", "cat", RegexOptions.None, new string[] { "cat" } };
+ yield return new object[] { engine, null, @".*\bcat\b", "dog cat mouse", RegexOptions.None, new string[] { "dog cat" } };
+ yield return new object[] { engine, null, @".*\bcat\b", "cat", RegexOptions.ECMAScript, new string[] { "cat" } };
+ yield return new object[] { engine, null, @".*\bcat\b", "dog cat mouse", RegexOptions.ECMAScript, new string[] { "dog cat" } };
+ yield return new object[] { engine, null, @"\b@cat", "123START123@catEND", RegexOptions.None, new string[] { "@cat" } };
+ yield return new object[] { engine, null, @"\b\<cat", "123START123<catEND", RegexOptions.None, new string[] { "<cat" } };
+ yield return new object[] { engine, null, @"\b,cat", "satwe,,,START,catEND", RegexOptions.None, new string[] { ",cat" } };
+ yield return new object[] { engine, null, @"\b\[cat", "`12START123[catEND", RegexOptions.None, new string[] { "[cat" } };
+
+ //\B
+ yield return new object[] { engine, null, @"\Bcat\B", "dogcatmouse", RegexOptions.None, new string[] { "cat" } };
+ yield return new object[] { engine, null, @"dog\Bcat\B", "dogcatmouse", RegexOptions.None, new string[] { "dogcat" } };
+ yield return new object[] { engine, null, @".*\Bcat\B", "dogcatmouse", RegexOptions.None, new string[] { "dogcat" } };
+ yield return new object[] { engine, null, @"\Bcat\B", "dogcatmouse", RegexOptions.ECMAScript, new string[] { "cat" } };
+ yield return new object[] { engine, null, @"dog\Bcat\B", "dogcatmouse", RegexOptions.ECMAScript, new string[] { "dogcat" } };
+ yield return new object[] { engine, null, @".*\Bcat\B", "dogcatmouse", RegexOptions.ECMAScript, new string[] { "dogcat" } };
+ yield return new object[] { engine, null, @"\B@cat", "123START123;@catEND", RegexOptions.None, new string[] { "@cat" } };
+ yield return new object[] { engine, null, @"\B\<cat", "123START123'<catEND", RegexOptions.None, new string[] { "<cat" } };
+ yield return new object[] { engine, null, @"\B,cat", "satwe,,,START',catEND", RegexOptions.None, new string[] { ",cat" } };
+ yield return new object[] { engine, null, @"\B\[cat", "`12START123'[catEND", RegexOptions.None, new string[] { "[cat" } };
+
+ // \w matching \p{Lm} (Letter, Modifier)
+ yield return new object[] { engine, null, @"\w+\s+\w+", "cat\u02b0 dog\u02b1", RegexOptions.None, new string[] { "cat\u02b0 dog\u02b1" } };
+ yield return new object[] { engine, null, @"cat\w+\s+dog\w+", "STARTcat\u30FC dog\u3005END", RegexOptions.None, new string[] { "cat\u30FC dog\u3005END" } };
+ yield return new object[] { engine, null, @"cat\w+\s+dog\w+", "STARTcat\uff9e dog\uff9fEND", RegexOptions.None, new string[] { "cat\uff9e dog\uff9fEND" } };
+ yield return new object[] { engine, null, @"(\w+)\s+(\w+)", "cat\u02b0 dog\u02b1", RegexOptions.None, new string[] { "cat\u02b0 dog\u02b1", "cat\u02b0", "dog\u02b1" } };
+ yield return new object[] { engine, null, @"(cat\w+)\s+(dog\w+)", "STARTcat\u30FC dog\u3005END", RegexOptions.None, new string[] { "cat\u30FC dog\u3005END", "cat\u30FC", "dog\u3005END" } };
+ yield return new object[] { engine, null, @"(cat\w+)\s+(dog\w+)", "STARTcat\uff9e dog\uff9fEND", RegexOptions.None, new string[] { "cat\uff9e dog\uff9fEND", "cat\uff9e", "dog\uff9fEND" } };
+
+ // Positive and negative character classes [a-c]|[^b-c]
+ yield return new object[] { engine, null, @"[^a]|d", "d", RegexOptions.None, new string[] { "d" } };
+ yield return new object[] { engine, null, @"([^a]|[d])*", "Hello Worlddf", RegexOptions.None, new string[] { "Hello Worlddf", "f" } };
+ yield return new object[] { engine, null, @"([^{}]|\n)+", "{{{{Hello\n World \n}END", RegexOptions.None, new string[] { "Hello\n World \n", "\n" } };
+ yield return new object[] { engine, null, @"([a-d]|[^abcd])+", "\tonce\n upon\0 a- ()*&^%#time?", RegexOptions.None, new string[] { "\tonce\n upon\0 a- ()*&^%#time?", "?" } };
+ yield return new object[] { engine, null, @"([^a]|[a])*", "once upon a time", RegexOptions.None, new string[] { "once upon a time", "e" } };
+ yield return new object[] { engine, null, @"([a-d]|[^abcd]|[x-z]|^wxyz])+", "\tonce\n upon\0 a- ()*&^%#time?", RegexOptions.None, new string[] { "\tonce\n upon\0 a- ()*&^%#time?", "?" } };
+ yield return new object[] { engine, null, @"([a-d]|[e-i]|[^e]|wxyz])+", "\tonce\n upon\0 a- ()*&^%#time?", RegexOptions.None, new string[] { "\tonce\n upon\0 a- ()*&^%#time?", "?" } };
+
+ // Canonical and noncanonical char class, where one group is in its
+ // simplest form [a-e] and another is more complex.
+ yield return new object[] { engine, null, @"^(([^b]+ )|(.* ))$", "aaa ", RegexOptions.None, new string[] { "aaa ", "aaa ", "aaa ", "" } };
+ yield return new object[] { engine, null, @"^(([^b]+ )|(.*))$", "aaa", RegexOptions.None, new string[] { "aaa", "aaa", "", "aaa" } };
+ yield return new object[] { engine, null, @"^(([^b]+ )|(.* ))$", "bbb ", RegexOptions.None, new string[] { "bbb ", "bbb ", "", "bbb " } };
+ yield return new object[] { engine, null, @"^(([^b]+ )|(.*))$", "bbb", RegexOptions.None, new string[] { "bbb", "bbb", "", "bbb" } };
+ yield return new object[] { engine, null, @"^((a*)|(.*))$", "aaa", RegexOptions.None, new string[] { "aaa", "aaa", "aaa", "" } };
+ yield return new object[] { engine, null, @"^((a*)|(.*))$", "aaabbb", RegexOptions.None, new string[] { "aaabbb", "aaabbb", "", "aaabbb" } };
+
+ yield return new object[] { engine, null, @"(([0-9])|([a-z])|([A-Z]))*", "{hello 1234567890 world}", RegexOptions.None, new string[] { "", "", "", "", "" } };
+ yield return new object[] { engine, null, @"(([0-9])|([a-z])|([A-Z]))+", "{hello 1234567890 world}", RegexOptions.None, new string[] { "hello", "o", "", "o", "" } };
+ yield return new object[] { engine, null, @"(([0-9])|([a-z])|([A-Z]))*", "{HELLO 1234567890 world}", RegexOptions.None, new string[] { "", "", "", "", "" } };
+ yield return new object[] { engine, null, @"(([0-9])|([a-z])|([A-Z]))+", "{HELLO 1234567890 world}", RegexOptions.None, new string[] { "HELLO", "O", "", "", "O" } };
+ yield return new object[] { engine, null, @"(([0-9])|([a-z])|([A-Z]))*", "{1234567890 hello world}", RegexOptions.None, new string[] { "", "", "", "", "" } };
+ yield return new object[] { engine, null, @"(([0-9])|([a-z])|([A-Z]))+", "{1234567890 hello world}", RegexOptions.None, new string[] { "1234567890", "0", "0", "", "" } };
+
+ yield return new object[] { engine, null, @"^(([a-d]*)|([a-z]*))$", "aaabbbcccdddeeefff", RegexOptions.None, new string[] { "aaabbbcccdddeeefff", "aaabbbcccdddeeefff", "", "aaabbbcccdddeeefff" } };
+ yield return new object[] { engine, null, @"^(([d-f]*)|([c-e]*))$", "dddeeeccceee", RegexOptions.None, new string[] { "dddeeeccceee", "dddeeeccceee", "", "dddeeeccceee" } };
+ yield return new object[] { engine, null, @"^(([c-e]*)|([d-f]*))$", "dddeeeccceee", RegexOptions.None, new string[] { "dddeeeccceee", "dddeeeccceee", "dddeeeccceee", "" } };
+
+ // Different match in NonBackTracking when order of alternations does not matter
+ yield return new object[] { engine, null, @"(([a-d]*)|([a-z]*))", "aaabbbcccdddeeefff", RegexOptions.None, new string[] { "aaabbbcccddd", "aaabbbcccddd", "aaabbbcccddd", "" }, "aaabbbcccdddeeefff" }; // <-- Nonbacktracking match same as for "(([a-z]*)|([a-d]*))"
+ yield return new object[] { engine, null, @"(([d-f]*)|([c-e]*))", "dddeeeccceee", RegexOptions.None, new string[] { "dddeee", "dddeee", "dddeee", "" }, "dddeeeccceee" }; // <-- Nonbacktracking match same as for "(([c-e]*)|([d-f]*))"
+ yield return new object[] { engine, null, @"(([c-e]*)|([d-f]*))", "dddeeeccceee", RegexOptions.None, new string[] { "dddeeeccceee", "dddeeeccceee", "dddeeeccceee", "" } };
+
+ yield return new object[] { engine, null, @"(([a-d]*)|(.*))", "aaabbbcccdddeeefff", RegexOptions.None, new string[] { "aaabbbcccddd", "aaabbbcccddd", "aaabbbcccddd", "" }, "aaabbbcccdddeeefff" }; // <-- Nonbacktracking match same as for ".*"
+ yield return new object[] { engine, null, @"(([d-f]*)|(.*))", "dddeeeccceee", RegexOptions.None, new string[] { "dddeee", "dddeee", "dddeee", "" }, "dddeeeccceee" }; // <-- Nonbacktracking match same as for ".*"
+ yield return new object[] { engine, null, @"(([c-e]*)|(.*))", "dddeeeccceee", RegexOptions.None, new string[] { "dddeeeccceee", "dddeeeccceee", "dddeeeccceee", "" } };
+
+ // \p{Pi} (Punctuation Initial quote) \p{Pf} (Punctuation Final quote)
+ yield return new object[] { engine, null, @"\p{Pi}(\w*)\p{Pf}", "\u00ABCat\u00BB \u00BBDog\u00AB'", RegexOptions.None, new string[] { "\u00ABCat\u00BB", "Cat" } };
+ yield return new object[] { engine, null, @"\p{Pi}(\w*)\p{Pf}", "\u2018Cat\u2019 \u2019Dog\u2018'", RegexOptions.None, new string[] { "\u2018Cat\u2019", "Cat" } };
+
+ // ECMAScript
+ yield return new object[] { engine, null, @"(?<cat>cat)\s+(?<dog>dog)\s+\123\s+\234", "asdfcat dog cat23 dog34eia", RegexOptions.ECMAScript, new string[] { "cat dog cat23 dog34", "cat", "dog" } };
+
+ // Balanced Matching
+ yield return new object[] { engine, null, @"<div>
(?>
<div>(?<DEPTH>) |
</div> (?<-DEPTH>) |
(?(DEPTH)(?!))
</div>", "<div>this is some <div>red</div> text</div></div></div>", RegexOptions.IgnorePatternWhitespace, new string[] { "<div>this is some <div>red</div> text</div>", "" } };
- yield return new object[] { null, @"(
+ yield return new object[] { engine, null, @"(
((?'open'<+)[^<>]*)+
((?'close-open'>+)[^<>]*)+
)+", "<01deep_01<02deep_01<03deep_01>><02deep_02><02deep_03<03deep_03>>>", RegexOptions.IgnorePatternWhitespace, new string[] { "<01deep_01<02deep_01<03deep_01>><02deep_02><02deep_03<03deep_03>>>", "<02deep_03<03deep_03>>>", "<03deep_03", ">>>", "<", "03deep_03" } };
- yield return new object[] { null, @"(
+ yield return new object[] { engine, null, @"(
(?<start><)?
[^<>]?
(?<end-start>>)?
)*", "<01deep_01<02deep_01<03deep_01>><02deep_02><02deep_03<03deep_03>>>", RegexOptions.IgnorePatternWhitespace, new string[] { "<01deep_01<02deep_01<03deep_01>><02deep_02><02deep_03<03deep_03>>>", "", "", "01deep_01<02deep_01<03deep_01>><02deep_02><02deep_03<03deep_03>>" } };
- yield return new object[] { null, @"(
+ yield return new object[] { engine, null, @"(
(?<start><[^/<>]*>)?
[^<>]?
(?<end-start></[^/<>]*>)?
)*", "<b><a>Cat</a></b>", RegexOptions.IgnorePatternWhitespace, new string[] { "<b><a>Cat</a></b>", "", "", "<a>Cat</a>" } };
- yield return new object[] { null, @"(
+ yield return new object[] { engine, null, @"(
(?<start><(?<TagName>[^/<>]*)>)?
[^<>]?
(?<end-start></\k<TagName>>)?
)*", "<b>cat</b><a>dog</a>", RegexOptions.IgnorePatternWhitespace, new string[] { "<b>cat</b><a>dog</a>", "", "", "a", "dog" } };
- // Balanced Matching With Backtracking
- yield return new object[] { null, @"(
+ // Balanced Matching With Backtracking
+ yield return new object[] { engine, null, @"(
(?<start><[^/<>]*>)?
.?
(?<end-start></[^/<>]*>)?
)*
(?(start)(?!)) ", "<b><a>Cat</a></b><<<<c>>>><<d><e<f>><g><<<>>>>", RegexOptions.IgnorePatternWhitespace, new string[] { "<b><a>Cat</a></b><<<<c>>>><<d><e<f>><g><<<>>>>", "", "", "<a>Cat" } };
- // Character Classes and Lazy quantifier
- yield return new object[] { null, @"([0-9]+?)([\w]+?)", "55488aheiaheiad", RegexOptions.ECMAScript, new string[] { "55", "5", "5" } };
- yield return new object[] { null, @"([0-9]+?)([a-z]+?)", "55488aheiaheiad", RegexOptions.ECMAScript, new string[] { "55488a", "55488", "a" } };
+ // Character Classes and Lazy quantifier
+ yield return new object[] { engine, null, @"([0-9]+?)([\w]+?)", "55488aheiaheiad", RegexOptions.ECMAScript, new string[] { "55", "5", "5" } };
+ yield return new object[] { engine, null, @"([0-9]+?)([a-z]+?)", "55488aheiaheiad", RegexOptions.ECMAScript, new string[] { "55488a", "55488", "a" } };
- // Miscellaneous/Regression scenarios
- yield return new object[] { null, @"(?<openingtag>1)(?<content>.*?)(?=2)", "1" + Environment.NewLine + "<Projecaa DefaultTargets=\"x\"/>" + Environment.NewLine + "2", RegexOptions.Singleline | RegexOptions.ExplicitCapture,
+ // Miscellaneous/Regression scenarios
+ yield return new object[] { engine, null, @"(?<openingtag>1)(?<content>.*?)(?=2)", "1" + Environment.NewLine + "<Projecaa DefaultTargets=\"x\"/>" + Environment.NewLine + "2", RegexOptions.Singleline | RegexOptions.ExplicitCapture,
new string[] { "1" + Environment.NewLine + "<Projecaa DefaultTargets=\"x\"/>" + Environment.NewLine, "1", Environment.NewLine + "<Projecaa DefaultTargets=\"x\"/>"+ Environment.NewLine } };
- yield return new object[] { null, @"\G<%#(?<code>.*?)?%>", @"<%# DataBinder.Eval(this, ""MyNumber"") %>", RegexOptions.Singleline, new string[] { @"<%# DataBinder.Eval(this, ""MyNumber"") %>", @" DataBinder.Eval(this, ""MyNumber"") " } };
-
- // Nested Quantifiers
- yield return new object[] { null, @"^[abcd]{0,0x10}*$", "a{0,0x10}}}", RegexOptions.None, new string[] { "a{0,0x10}}}" } };
-
- // Lazy operator Backtracking
- yield return new object[] { null, @"http://([a-zA-z0-9\-]*\.?)*?(:[0-9]*)??/", "http://www.msn.com/", RegexOptions.IgnoreCase, new string[] { "http://www.msn.com/", "com", string.Empty } };
- yield return new object[] { null, @"http://([a-zA-Z0-9\-]*\.?)*?/", @"http://www.google.com/", RegexOptions.IgnoreCase, new string[] { "http://www.google.com/", "com" } };
-
- yield return new object[] { null, @"([a-z]*?)([\w])", "cat", RegexOptions.IgnoreCase, new string[] { "c", string.Empty, "c" } };
- yield return new object[] { null, @"^([a-z]*?)([\w])$", "cat", RegexOptions.IgnoreCase, new string[] { "cat", "ca", "t" } };
-
- // Backtracking
- yield return new object[] { null, @"([a-z]*)([\w])", "cat", RegexOptions.IgnoreCase, new string[] { "cat", "ca", "t" } };
- yield return new object[] { null, @"^([a-z]*)([\w])$", "cat", RegexOptions.IgnoreCase, new string[] { "cat", "ca", "t" } };
-
- // Backtracking with multiple (.*) groups -- important ASP.NET scenario
- yield return new object[] { null, @"(.*)/(.*).aspx", "/.aspx", RegexOptions.None, new string[] { "/.aspx", string.Empty, string.Empty } };
- yield return new object[] { null, @"(.*)/(.*).aspx", "/homepage.aspx", RegexOptions.None, new string[] { "/homepage.aspx", string.Empty, "homepage" } };
- yield return new object[] { null, @"(.*)/(.*).aspx", "pages/.aspx", RegexOptions.None, new string[] { "pages/.aspx", "pages", string.Empty } };
- yield return new object[] { null, @"(.*)/(.*).aspx", "pages/homepage.aspx", RegexOptions.None, new string[] { "pages/homepage.aspx", "pages", "homepage" } };
- yield return new object[] { null, @"(.*)/(.*).aspx", "/pages/homepage.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx", "/pages", "homepage" } };
- yield return new object[] { null, @"(.*)/(.*).aspx", "/pages/homepage/index.aspx", RegexOptions.None, new string[] { "/pages/homepage/index.aspx", "/pages/homepage", "index" } };
- yield return new object[] { null, @"(.*)/(.*).aspx", "/pages/homepage.aspx/index.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", "/pages/homepage.aspx", "index" } };
- yield return new object[] { null, @"(.*)/(.*)/(.*).aspx", "/pages/homepage.aspx/index.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", "/pages", "homepage.aspx", "index" } };
-
- // Backtracking with multiple (.+) groups
- yield return new object[] { null, @"(.+)/(.+).aspx", "pages/homepage.aspx", RegexOptions.None, new string[] { "pages/homepage.aspx", "pages", "homepage" } };
- yield return new object[] { null, @"(.+)/(.+).aspx", "/pages/homepage.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx", "/pages", "homepage" } };
- yield return new object[] { null, @"(.+)/(.+).aspx", "/pages/homepage/index.aspx", RegexOptions.None, new string[] { "/pages/homepage/index.aspx", "/pages/homepage", "index" } };
- yield return new object[] { null, @"(.+)/(.+).aspx", "/pages/homepage.aspx/index.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", "/pages/homepage.aspx", "index" } };
- yield return new object[] { null, @"(.+)/(.+)/(.+).aspx", "/pages/homepage.aspx/index.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", "/pages", "homepage.aspx", "index" } };
-
- // Backtracking with (.+) group followed by (.*)
- yield return new object[] { null, @"(.+)/(.*).aspx", "pages/.aspx", RegexOptions.None, new string[] { "pages/.aspx", "pages", string.Empty } };
- yield return new object[] { null, @"(.+)/(.*).aspx", "pages/homepage.aspx", RegexOptions.None, new string[] { "pages/homepage.aspx", "pages", "homepage" } };
- yield return new object[] { null, @"(.+)/(.*).aspx", "/pages/homepage.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx", "/pages", "homepage" } };
- yield return new object[] { null, @"(.+)/(.*).aspx", "/pages/homepage/index.aspx", RegexOptions.None, new string[] { "/pages/homepage/index.aspx", "/pages/homepage", "index" } };
- yield return new object[] { null, @"(.+)/(.*).aspx", "/pages/homepage.aspx/index.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", "/pages/homepage.aspx", "index" } };
- yield return new object[] { null, @"(.+)/(.*)/(.*).aspx", "/pages/homepage.aspx/index.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", "/pages", "homepage.aspx", "index" } };
-
- // Backtracking with (.*) group followed by (.+)
- yield return new object[] { null, @"(.*)/(.+).aspx", "/homepage.aspx", RegexOptions.None, new string[] { "/homepage.aspx", string.Empty, "homepage" } };
- yield return new object[] { null, @"(.*)/(.+).aspx", "pages/homepage.aspx", RegexOptions.None, new string[] { "pages/homepage.aspx", "pages", "homepage" } };
- yield return new object[] { null, @"(.*)/(.+).aspx", "/pages/homepage.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx", "/pages", "homepage" } };
- yield return new object[] { null, @"(.*)/(.+).aspx", "/pages/homepage/index.aspx", RegexOptions.None, new string[] { "/pages/homepage/index.aspx", "/pages/homepage", "index" } };
- yield return new object[] { null, @"(.*)/(.+).aspx", "/pages/homepage.aspx/index.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", "/pages/homepage.aspx", "index" } };
- yield return new object[] { null, @"(.*)/(.+)/(.+).aspx", "/pages/homepage.aspx/index.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", "/pages", "homepage.aspx", "index" } };
-
- // Quantifiers
- yield return new object[] { null, @"a*", "", RegexOptions.None, new string[] { "" } };
- yield return new object[] { null, @"a*", "a", RegexOptions.None, new string[] { "a" } };
- yield return new object[] { null, @"a*", "aa", RegexOptions.None, new string[] { "aa" } };
- yield return new object[] { null, @"a*", "aaa", RegexOptions.None, new string[] { "aaa" } };
- yield return new object[] { null, @"a*?", "", RegexOptions.None, new string[] { "" } };
- yield return new object[] { null, @"a*?", "a", RegexOptions.None, new string[] { "" } };
- yield return new object[] { null, @"a*?", "aa", RegexOptions.None, new string[] { "" } };
- yield return new object[] { null, @"a+?", "aa", RegexOptions.None, new string[] { "a" } };
- yield return new object[] { null, @"a{1,", "a{1,", RegexOptions.None, new string[] { "a{1," } };
- yield return new object[] { null, @"a{1,3}", "aaaaa", RegexOptions.None, new string[] { "aaa" } };
- yield return new object[] { null, @"a{1,3}?", "aaaaa", RegexOptions.None, new string[] { "a" } };
- yield return new object[] { null, @"a{2,2}", "aaaaa", RegexOptions.None, new string[] { "aa" } };
- yield return new object[] { null, @"a{2,2}?", "aaaaa", RegexOptions.None, new string[] { "aa" } };
- yield return new object[] { null, @".{1,3}", "bb\nba", RegexOptions.None, new string[] { "bb" } };
- yield return new object[] { null, @".{1,3}?", "bb\nba", RegexOptions.None, new string[] { "b" } };
- yield return new object[] { null, @".{2,2}", "bbb\nba", RegexOptions.None, new string[] { "bb" } };
- yield return new object[] { null, @".{2,2}?", "bbb\nba", RegexOptions.None, new string[] { "bb" } };
- yield return new object[] { null, @"[abc]{1,3}", "ccaba", RegexOptions.None, new string[] { "cca" } };
- yield return new object[] { null, @"[abc]{1,3}?", "ccaba", RegexOptions.None, new string[] { "c" } };
- yield return new object[] { null, @"[abc]{2,2}", "ccaba", RegexOptions.None, new string[] { "cc" } };
- yield return new object[] { null, @"[abc]{2,2}?", "ccaba", RegexOptions.None, new string[] { "cc" } };
- yield return new object[] { null, @"(?:[abc]def){1,3}xyz", "cdefxyz", RegexOptions.None, new string[] { "cdefxyz" } };
- yield return new object[] { null, @"(?:[abc]def){1,3}xyz", "adefbdefcdefxyz", RegexOptions.None, new string[] { "adefbdefcdefxyz" } };
- yield return new object[] { null, @"(?:[abc]def){1,3}?xyz", "cdefxyz", RegexOptions.None, new string[] { "cdefxyz" } };
- yield return new object[] { null, @"(?:[abc]def){1,3}?xyz", "adefbdefcdefxyz", RegexOptions.None, new string[] { "adefbdefcdefxyz" } };
- yield return new object[] { null, @"(?:[abc]def){2,2}xyz", "adefbdefcdefxyz", RegexOptions.None, new string[] { "bdefcdefxyz" } };
- yield return new object[] { null, @"(?:[abc]def){2,2}?xyz", "adefbdefcdefxyz", RegexOptions.None, new string[] { "bdefcdefxyz" } };
- foreach (string prefix in new[] { "", "xyz" })
- {
- yield return new object[] { null, prefix + @"(?:[abc]def){1,3}", prefix + "cdef", RegexOptions.None, new string[] { prefix + "cdef" } };
- yield return new object[] { null, prefix + @"(?:[abc]def){1,3}", prefix + "cdefadefbdef", RegexOptions.None, new string[] { prefix + "cdefadefbdef" } };
- yield return new object[] { null, prefix + @"(?:[abc]def){1,3}", prefix + "cdefadefbdefadef", RegexOptions.None, new string[] { prefix + "cdefadefbdef" } };
- yield return new object[] { null, prefix + @"(?:[abc]def){1,3}?", prefix + "cdef", RegexOptions.None, new string[] { prefix + "cdef" } };
- yield return new object[] { null, prefix + @"(?:[abc]def){1,3}?", prefix + "cdefadefbdef", RegexOptions.None, new string[] { prefix + "cdef" } };
- yield return new object[] { null, prefix + @"(?:[abc]def){2,2}", prefix + "cdefadefbdefadef", RegexOptions.None, new string[] { prefix + "cdefadef" } };
- yield return new object[] { null, prefix + @"(?:[abc]def){2,2}?", prefix + "cdefadefbdefadef", RegexOptions.None, new string[] { prefix + "cdefadef" } };
- }
- yield return new object[] { null, @"(cat){", "cat{", RegexOptions.None, new string[] { "cat{", "cat" } };
- yield return new object[] { null, @"(cat){}", "cat{}", RegexOptions.None, new string[] { "cat{}", "cat" } };
- yield return new object[] { null, @"(cat){,", "cat{,", RegexOptions.None, new string[] { "cat{,", "cat" } };
- yield return new object[] { null, @"(cat){,}", "cat{,}", RegexOptions.None, new string[] { "cat{,}", "cat" } };
- yield return new object[] { null, @"(cat){cat}", "cat{cat}", RegexOptions.None, new string[] { "cat{cat}", "cat" } };
- yield return new object[] { null, @"(cat){cat,5}", "cat{cat,5}", RegexOptions.None, new string[] { "cat{cat,5}", "cat" } };
- yield return new object[] { null, @"(cat){5,dog}", "cat{5,dog}", RegexOptions.None, new string[] { "cat{5,dog}", "cat" } };
- yield return new object[] { null, @"(cat){cat,dog}", "cat{cat,dog}", RegexOptions.None, new string[] { "cat{cat,dog}", "cat" } };
- yield return new object[] { null, @"(cat){,}?", "cat{,}?", RegexOptions.None, new string[] { "cat{,}", "cat" } };
- yield return new object[] { null, @"(cat){cat}?", "cat{cat}?", RegexOptions.None, new string[] { "cat{cat}", "cat" } };
- yield return new object[] { null, @"(cat){cat,5}?", "cat{cat,5}?", RegexOptions.None, new string[] { "cat{cat,5}", "cat" } };
- yield return new object[] { null, @"(cat){5,dog}?", "cat{5,dog}?", RegexOptions.None, new string[] { "cat{5,dog}", "cat" } };
- yield return new object[] { null, @"(cat){cat,dog}?", "cat{cat,dog}?", RegexOptions.None, new string[] { "cat{cat,dog}", "cat" } };
-
- // Atomic subexpressions
- // Implicitly upgrading (or not) oneloop to be atomic
- yield return new object[] { null, @"a*b", "aaab", RegexOptions.None, new string[] { "aaab" } };
- yield return new object[] { null, @"a*b+", "aaab", RegexOptions.None, new string[] { "aaab" } };
- yield return new object[] { null, @"a*b+?", "aaab", RegexOptions.None, new string[] { "aaab" } };
- yield return new object[] { null, @"a*(?>b+)", "aaab", RegexOptions.None, new string[] { "aaab" } };
- yield return new object[] { null, @"a*[^a]", "aaab", RegexOptions.None, new string[] { "aaab" } };
- yield return new object[] { null, @"a*[^a]+", "aaab", RegexOptions.None, new string[] { "aaab" } };
- yield return new object[] { null, @"a*[^a]+?", "aaab", RegexOptions.None, new string[] { "aaab" } };
- yield return new object[] { null, @"a*(?>[^a]+)", "aaab", RegexOptions.None, new string[] { "aaab" } };
- yield return new object[] { null, @"a*bcd", "aaabcd", RegexOptions.None, new string[] { "aaabcd" } };
- yield return new object[] { null, @"a*[bcd]", "aaab", RegexOptions.None, new string[] { "aaab" } };
- yield return new object[] { null, @"a*[bcd]+", "aaab", RegexOptions.None, new string[] { "aaab" } };
- yield return new object[] { null, @"a*[bcd]+?", "aaab", RegexOptions.None, new string[] { "aaab" } };
- yield return new object[] { null, @"a*(?>[bcd]+)", "aaab", RegexOptions.None, new string[] { "aaab" } };
- yield return new object[] { null, @"a*[bcd]{1,3}", "aaab", RegexOptions.None, new string[] { "aaab" } };
- yield return new object[] { null, @"a*([bcd]ab|[bef]cd){1,3}", "aaababecdcac", RegexOptions.ExplicitCapture, new string[] { "aaababecd" } };
- yield return new object[] { null, @"a*([bcd]|[aef]){1,3}", "befb", RegexOptions.ExplicitCapture, new string[] { "bef" } }; // can't upgrade
- yield return new object[] { null, @"a*$", "aaa", RegexOptions.None, new string[] { "aaa" } };
- yield return new object[] { null, @"a*$", "aaa", RegexOptions.Multiline, new string[] { "aaa" } };
- yield return new object[] { null, @"a*\b", "aaa bbb", RegexOptions.None, new string[] { "aaa" } };
- yield return new object[] { null, @"a*\b", "aaa bbb", RegexOptions.ECMAScript, new string[] { "aaa" } };
- yield return new object[] { null, @"@*\B", "@@@", RegexOptions.None, new string[] { "@@@" } };
- yield return new object[] { null, @"@*\B", "@@@", RegexOptions.ECMAScript, new string[] { "@@@" } };
- yield return new object[] { null, @"(?:abcd*|efgh)i", "efghi", RegexOptions.None, new string[] { "efghi" } };
- yield return new object[] { null, @"(?:abcd|efgh*)i", "efgi", RegexOptions.None, new string[] { "efgi" } };
- yield return new object[] { null, @"(?:abcd|efghj{2,}|j[klm]o+)i", "efghjjjjji", RegexOptions.None, new string[] { "efghjjjjji" } };
- yield return new object[] { null, @"(?:abcd|efghi{2,}|j[klm]o+)i", "efghiii", RegexOptions.None, new string[] { "efghiii" } };
- yield return new object[] { null, @"(?:abcd|efghi{2,}|j[klm]o+)i", "efghiiiiiiii", RegexOptions.None, new string[] { "efghiiiiiiii" } };
- yield return new object[] { null, @"a?ba?ba?ba?b", "abbabab", RegexOptions.None, new string[] { "abbabab" } };
- yield return new object[] { null, @"a?ba?ba?ba?b", "abBAbab", RegexOptions.IgnoreCase, new string[] { "abBAbab" } };
- // Implicitly upgrading (or not) notoneloop to be atomic
- yield return new object[] { null, @"[^b]*b", "aaab", RegexOptions.None, new string[] { "aaab" } };
- yield return new object[] { null, @"[^b]*b+", "aaab", RegexOptions.None, new string[] { "aaab" } };
- yield return new object[] { null, @"[^b]*b+?", "aaab", RegexOptions.None, new string[] { "aaab" } };
- yield return new object[] { null, @"[^b]*(?>b+)", "aaab", RegexOptions.None, new string[] { "aaab" } };
- yield return new object[] { null, @"[^b]*bac", "aaabac", RegexOptions.None, new string[] { "aaabac" } };
- yield return new object[] { null, @"[^b]*", "aaa", RegexOptions.None, new string[] { "aaa" } };
- yield return new object[] { null, @"(?:abc[^b]*|efgh)i", "efghi", RegexOptions.None, new string[] { "efghi" } }; // can't upgrade
- yield return new object[] { null, @"(?:abcd|efg[^b]*)b", "efgb", RegexOptions.None, new string[] { "efgb" } };
- yield return new object[] { null, @"(?:abcd|efg[^b]*)i", "efgi", RegexOptions.None, new string[] { "efgi" } }; // can't upgrade
- yield return new object[] { null, @"[^a]?a[^a]?a[^a]?a[^a]?a", "baababa", RegexOptions.None, new string[] { "baababa" } };
- yield return new object[] { null, @"[^a]?a[^a]?a[^a]?a[^a]?a", "BAababa", RegexOptions.IgnoreCase, new string[] { "BAababa" } };
- // Implicitly upgrading (or not) setloop to be atomic
- yield return new object[] { null, @"[ac]*", "aaa", RegexOptions.None, new string[] { "aaa" } };
- yield return new object[] { null, @"[ac]*b", "aaab", RegexOptions.None, new string[] { "aaab" } };
- yield return new object[] { null, @"[ac]*b+", "aaab", RegexOptions.None, new string[] { "aaab" } };
- yield return new object[] { null, @"[ac]*b+?", "aaab", RegexOptions.None, new string[] { "aaab" } };
- yield return new object[] { null, @"[ac]*(?>b+)", "aaab", RegexOptions.None, new string[] { "aaab" } };
- yield return new object[] { null, @"[ac]*[^a]", "aaab", RegexOptions.None, new string[] { "aaab" } };
- yield return new object[] { null, @"[ac]*[^a]+", "aaab", RegexOptions.None, new string[] { "aaab" } };
- yield return new object[] { null, @"[ac]*[^a]+?", "aaab", RegexOptions.None, new string[] { "aaab" } };
- yield return new object[] { null, @"[ac]*(?>[^a]+)", "aaab", RegexOptions.None, new string[] { "aaab" } };
- yield return new object[] { null, @"[ac]*bcd", "aaabcd", RegexOptions.None, new string[] { "aaabcd" } };
- yield return new object[] { null, @"[ac]*[bd]", "aaab", RegexOptions.None, new string[] { "aaab" } };
- yield return new object[] { null, @"[ac]*[bd]+", "aaab", RegexOptions.None, new string[] { "aaab" } };
- yield return new object[] { null, @"[ac]*[bd]+?", "aaab", RegexOptions.None, new string[] { "aaab" } };
- yield return new object[] { null, @"[ac]*(?>[bd]+)", "aaab", RegexOptions.None, new string[] { "aaab" } };
- yield return new object[] { null, @"[ac]*[bd]{1,3}", "aaab", RegexOptions.None, new string[] { "aaab" } };
- yield return new object[] { null, @"[ac]*$", "aaa", RegexOptions.None, new string[] { "aaa" } };
- yield return new object[] { null, @"[ac]*$", "aaa", RegexOptions.Multiline, new string[] { "aaa" } };
- yield return new object[] { null, @"[ac]*\b", "aaa bbb", RegexOptions.None, new string[] { "aaa" } };
- yield return new object[] { null, @"[ac]*\b", "aaa bbb", RegexOptions.ECMAScript, new string[] { "aaa" } };
- yield return new object[] { null, @"[@']*\B", "@@@", RegexOptions.None, new string[] { "@@@" } };
- yield return new object[] { null, @"[@']*\B", "@@@", RegexOptions.ECMAScript, new string[] { "@@@" } };
- yield return new object[] { null, @".*.", "@@@", RegexOptions.Singleline, new string[] { "@@@" } };
- yield return new object[] { null, @"(?:abcd|efg[hij]*)h", "efgh", RegexOptions.None, new string[] { "efgh" } }; // can't upgrade
- yield return new object[] { null, @"(?:abcd|efg[hij]*)ih", "efgjih", RegexOptions.None, new string[] { "efgjih" } }; // can't upgrade
- yield return new object[] { null, @"(?:abcd|efg[hij]*)k", "efgjk", RegexOptions.None, new string[] { "efgjk" } };
- yield return new object[] { null, @"[ace]?b[ace]?b[ace]?b[ace]?b", "cbbabeb", RegexOptions.None, new string[] { "cbbabeb" } };
- yield return new object[] { null, @"[ace]?b[ace]?b[ace]?b[ace]?b", "cBbAbEb", RegexOptions.IgnoreCase, new string[] { "cBbAbEb" } };
- yield return new object[] { null, @"a[^wz]*w", "abcdcdcdwz", RegexOptions.None, new string[] { "abcdcdcdw" } };
- yield return new object[] { null, @"a[^wyz]*w", "abcdcdcdwz", RegexOptions.None, new string[] { "abcdcdcdw" } };
- yield return new object[] { null, @"a[^wyz]*W", "abcdcdcdWz", RegexOptions.IgnoreCase, new string[] { "abcdcdcdW" } };
- // Implicitly upgrading (or not) concat loops to be atomic
- yield return new object[] { null, @"(?:[ab]c[de]f)*", "", RegexOptions.None, new string[] { "" } };
- yield return new object[] { null, @"(?:[ab]c[de]f)*", "acdf", RegexOptions.None, new string[] { "acdf" } };
- yield return new object[] { null, @"(?:[ab]c[de]f)*", "acdfbcef", RegexOptions.None, new string[] { "acdfbcef" } };
- yield return new object[] { null, @"(?:[ab]c[de]f)*", "cdfbcef", RegexOptions.None, new string[] { "" } };
- yield return new object[] { null, @"(?:[ab]c[de]f)+", "cdfbcef", RegexOptions.None, new string[] { "bcef" } };
- yield return new object[] { null, @"(?:[ab]c[de]f)*", "bcefbcdfacfe", RegexOptions.None, new string[] { "bcefbcdf" } };
- // Implicitly upgrading (or not) nested loops to be atomic
- yield return new object[] { null, @"(?:a){3}", "aaaaaaaaa", RegexOptions.None, new string[] { "aaa" } };
- yield return new object[] { null, @"(?:a){3}?", "aaaaaaaaa", RegexOptions.None, new string[] { "aaa" } };
- yield return new object[] { null, @"(?:a{2}){3}", "aaaaaaaaa", RegexOptions.None, new string[] { "aaaaaa" } };
- yield return new object[] { null, @"(?:a{2}?){3}?", "aaaaaaaaa", RegexOptions.None, new string[] { "aaaaaa" } };
- yield return new object[] { null, @"(?:(?:[ab]c[de]f){3}){2}", "acdfbcdfacefbcefbcefbcdfacdef", RegexOptions.None, new string[] { "acdfbcdfacefbcefbcefbcdf" } };
- yield return new object[] { null, @"(?:(?:[ab]c[de]f){3}hello){2}", "aaaaaacdfbcdfacefhellobcefbcefbcdfhellooooo", RegexOptions.None, new string[] { "acdfbcdfacefhellobcefbcefbcdfhello" } };
- yield return new object[] { null, @"CN=(.*[^,]+).*", "CN=localhost", RegexOptions.Singleline, new string[] { "CN=localhost", "localhost" } };
- // Nested atomic
- yield return new object[] { null, @"(?>abc[def]gh(i*))", "123abceghiii456", RegexOptions.None, new string[] { "abceghiii", "iii" } };
- yield return new object[] { null, @"(?>(?:abc)*)", "abcabcabc", RegexOptions.None, new string[] { "abcabcabc" } };
-
- // Anchoring loops beginning with .* / .+
- yield return new object[] { null, @".*", "", RegexOptions.None, new string[] { "" } };
- yield return new object[] { null, @".*", "\n\n\n\n", RegexOptions.None, new string[] { "" } };
- yield return new object[] { null, @".*", "\n\n\n\n", RegexOptions.Singleline, new string[] { "\n\n\n\n" } };
- yield return new object[] { null, @".*[1a]", "\n\n\n\n1", RegexOptions.None, new string[] { "1" } };
- yield return new object[] { null, @"(?s).*(?-s)[1a]", "1\n\n\n\n", RegexOptions.None, new string[] { "1" } };
- yield return new object[] { null, @"(?s).*(?-s)[1a]", "\n\n\n\n1", RegexOptions.None, new string[] { "\n\n\n\n1" } };
- yield return new object[] { null, @".*|.*|.*", "", RegexOptions.None, new string[] { "" } };
- yield return new object[] { null, @".*123|abc", "abc\n123", RegexOptions.None, new string[] { "abc" } };
- yield return new object[] { null, @".*123|abc", "abc\n123", RegexOptions.Singleline, new string[] { "abc\n123" }, "abc" }; // <-- Nonbacktracking match same as for "abc|.*123"
- yield return new object[] { null, @"abc|.*123", "abc\n123", RegexOptions.Singleline, new string[] { "abc" } };
- yield return new object[] { null, @".*", "\n", RegexOptions.None, new string[] { "" } };
- yield return new object[] { null, @".*\n", "\n", RegexOptions.None, new string[] { "\n" } };
- yield return new object[] { null, @".*", "\n", RegexOptions.Singleline, new string[] { "\n" } };
- yield return new object[] { null, @".*\n", "\n", RegexOptions.Singleline, new string[] { "\n" } };
- yield return new object[] { null, @".*", "abc", RegexOptions.None, new string[] { "abc" } };
- yield return new object[] { null, @".*abc", "abc", RegexOptions.None, new string[] { "abc" } };
- yield return new object[] { null, @".*abc|ghi", "ghi", RegexOptions.None, new string[] { "ghi" } };
- yield return new object[] { null, @".*abc|.*ghi", "abcghi", RegexOptions.None, new string[] { "abc" }, "abcghi" }; // <-- Nonbacktracking match same as for ".*ghi|.*abc"
- yield return new object[] { null, @".*ghi|.*abc", "abcghi", RegexOptions.None, new string[] { "abcghi" } };
- yield return new object[] { null, @".*abc|.*ghi", "bcghi", RegexOptions.None, new string[] { "bcghi" } };
- yield return new object[] { null, @".*abc|.+c", " \n \n bc", RegexOptions.None, new string[] { " bc" } };
- yield return new object[] { null, @".*abc", "12345 abc", RegexOptions.None, new string[] { "12345 abc" } };
- yield return new object[] { null, @".*abc", "12345\n abc", RegexOptions.None, new string[] { " abc" } };
- yield return new object[] { null, @".*abc", "12345\n abc", RegexOptions.Singleline, new string[] { "12345\n abc" } };
- yield return new object[] { null, @"(.*)abc\1", "\n12345abc12345", RegexOptions.Singleline, new string[] { "12345abc12345", "12345" } };
- yield return new object[] { null, @".*\nabc", "\n123\nabc", RegexOptions.None, new string[] { "123\nabc" } };
- yield return new object[] { null, @".*\nabc", "\n123\nabc", RegexOptions.Singleline, new string[] { "\n123\nabc" } };
- yield return new object[] { null, @".*abc", "abc abc abc \nabc", RegexOptions.None, new string[] { "abc abc abc" } };
- yield return new object[] { null, @".*abc", "abc abc abc \nabc", RegexOptions.Singleline, new string[] { "abc abc abc \nabc" } };
- yield return new object[] { null, @".*?abc", "abc abc abc \nabc", RegexOptions.None, new string[] { "abc" } };
- yield return new object[] { null, @"[^\n]*abc", "123abc\n456abc\n789abc", RegexOptions.None, new string[] { "123abc" } };
- yield return new object[] { null, @"[^\n]*abc", "123abc\n456abc\n789abc", RegexOptions.Singleline, new string[] { "123abc" } };
- yield return new object[] { null, @"[^\n]*abc", "123ab\n456abc\n789abc", RegexOptions.None, new string[] { "456abc" } };
- yield return new object[] { null, @"[^\n]*abc", "123ab\n456abc\n789abc", RegexOptions.Singleline, new string[] { "456abc" } };
- yield return new object[] { null, @".+", "a", RegexOptions.None, new string[] { "a" } };
- yield return new object[] { null, @".+", "\nabc", RegexOptions.None, new string[] { "abc" } };
- yield return new object[] { null, @".+", "\n", RegexOptions.Singleline, new string[] { "\n" } };
- yield return new object[] { null, @".+", "\nabc", RegexOptions.Singleline, new string[] { "\nabc" } };
- yield return new object[] { null, @".+abc", "aaaabc", RegexOptions.None, new string[] { "aaaabc" } };
- yield return new object[] { null, @".+abc", "12345 abc", RegexOptions.None, new string[] { "12345 abc" } };
- yield return new object[] { null, @".+abc", "12345\n abc", RegexOptions.None, new string[] { " abc" } };
- yield return new object[] { null, @".+abc", "12345\n abc", RegexOptions.Singleline, new string[] { "12345\n abc" } };
- yield return new object[] { null, @"(.+)abc\1", "\n12345abc12345", RegexOptions.Singleline, new string[] { "12345abc12345", "12345" } };
-
- // Unanchored .*
- yield return new object[] { null, @"\A\s*(?<name>\w+)(\s*\((?<arguments>.*)\))?\s*\Z", "Match(Name)", RegexOptions.None, new string[] { "Match(Name)", "(Name)", "Match", "Name" } };
- yield return new object[] { null, @"\A\s*(?<name>\w+)(\s*\((?<arguments>.*)\))?\s*\Z", "Match(Na\nme)", RegexOptions.Singleline, new string[] { "Match(Na\nme)", "(Na\nme)", "Match", "Na\nme" } };
- foreach (RegexOptions options in new[] { RegexOptions.None, RegexOptions.Singleline })
- {
- yield return new object[] { null, @"abcd.*", @"abcabcd", options, new string[] { "abcd" } };
- yield return new object[] { null, @"abcd.*", @"abcabcde", options, new string[] { "abcde" } };
- yield return new object[] { null, @"abcd.*", @"abcabcdefg", options, new string[] { "abcdefg" } };
- yield return new object[] { null, @"abcd(.*)", @"ababcd", options, new string[] { "abcd", "" } };
- yield return new object[] { null, @"abcd(.*)", @"aabcde", options, new string[] { "abcde", "e" } };
- yield return new object[] { null, @"abcd(.*)", @"abcabcdefg", options, new string[] { "abcdefg", "efg" } };
- yield return new object[] { null, @"abcd(.*)e", @"abcabcdefg", options, new string[] { "abcde", "" } };
- yield return new object[] { null, @"abcd(.*)f", @"abcabcdefg", options, new string[] { "abcdef", "e" } };
- }
+ yield return new object[] { engine, null, @"\G<%#(?<code>.*?)?%>", @"<%# DataBinder.Eval(this, ""MyNumber"") %>", RegexOptions.Singleline, new string[] { @"<%# DataBinder.Eval(this, ""MyNumber"") %>", @" DataBinder.Eval(this, ""MyNumber"") " } };
+
+ // Nested Quantifiers
+ yield return new object[] { engine, null, @"^[abcd]{0,0x10}*$", "a{0,0x10}}}", RegexOptions.None, new string[] { "a{0,0x10}}}" } };
+
+ // Lazy operator Backtracking
+ yield return new object[] { engine, null, @"http://([a-zA-z0-9\-]*\.?)*?(:[0-9]*)??/", "http://www.msn.com/", RegexOptions.IgnoreCase, new string[] { "http://www.msn.com/", "com", string.Empty } };
+ yield return new object[] { engine, null, @"http://([a-zA-Z0-9\-]*\.?)*?/", @"http://www.google.com/", RegexOptions.IgnoreCase, new string[] { "http://www.google.com/", "com" } };
+
+ yield return new object[] { engine, null, @"([a-z]*?)([\w])", "cat", RegexOptions.IgnoreCase, new string[] { "c", string.Empty, "c" } };
+ yield return new object[] { engine, null, @"^([a-z]*?)([\w])$", "cat", RegexOptions.IgnoreCase, new string[] { "cat", "ca", "t" } };
+
+ // Backtracking
+ yield return new object[] { engine, null, @"([a-z]*)([\w])", "cat", RegexOptions.IgnoreCase, new string[] { "cat", "ca", "t" } };
+ yield return new object[] { engine, null, @"^([a-z]*)([\w])$", "cat", RegexOptions.IgnoreCase, new string[] { "cat", "ca", "t" } };
+
+ // Backtracking with multiple (.*) groups -- important ASP.NET scenario
+ yield return new object[] { engine, null, @"(.*)/(.*).aspx", "/.aspx", RegexOptions.None, new string[] { "/.aspx", string.Empty, string.Empty } };
+ yield return new object[] { engine, null, @"(.*)/(.*).aspx", "/homepage.aspx", RegexOptions.None, new string[] { "/homepage.aspx", string.Empty, "homepage" } };
+ yield return new object[] { engine, null, @"(.*)/(.*).aspx", "pages/.aspx", RegexOptions.None, new string[] { "pages/.aspx", "pages", string.Empty } };
+ yield return new object[] { engine, null, @"(.*)/(.*).aspx", "pages/homepage.aspx", RegexOptions.None, new string[] { "pages/homepage.aspx", "pages", "homepage" } };
+ yield return new object[] { engine, null, @"(.*)/(.*).aspx", "/pages/homepage.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx", "/pages", "homepage" } };
+ yield return new object[] { engine, null, @"(.*)/(.*).aspx", "/pages/homepage/index.aspx", RegexOptions.None, new string[] { "/pages/homepage/index.aspx", "/pages/homepage", "index" } };
+ yield return new object[] { engine, null, @"(.*)/(.*).aspx", "/pages/homepage.aspx/index.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", "/pages/homepage.aspx", "index" } };
+ yield return new object[] { engine, null, @"(.*)/(.*)/(.*).aspx", "/pages/homepage.aspx/index.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", "/pages", "homepage.aspx", "index" } };
+
+ // Backtracking with multiple (.+) groups
+ yield return new object[] { engine, null, @"(.+)/(.+).aspx", "pages/homepage.aspx", RegexOptions.None, new string[] { "pages/homepage.aspx", "pages", "homepage" } };
+ yield return new object[] { engine, null, @"(.+)/(.+).aspx", "/pages/homepage.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx", "/pages", "homepage" } };
+ yield return new object[] { engine, null, @"(.+)/(.+).aspx", "/pages/homepage/index.aspx", RegexOptions.None, new string[] { "/pages/homepage/index.aspx", "/pages/homepage", "index" } };
+ yield return new object[] { engine, null, @"(.+)/(.+).aspx", "/pages/homepage.aspx/index.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", "/pages/homepage.aspx", "index" } };
+ yield return new object[] { engine, null, @"(.+)/(.+)/(.+).aspx", "/pages/homepage.aspx/index.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", "/pages", "homepage.aspx", "index" } };
+
+ // Backtracking with (.+) group followed by (.*)
+ yield return new object[] { engine, null, @"(.+)/(.*).aspx", "pages/.aspx", RegexOptions.None, new string[] { "pages/.aspx", "pages", string.Empty } };
+ yield return new object[] { engine, null, @"(.+)/(.*).aspx", "pages/homepage.aspx", RegexOptions.None, new string[] { "pages/homepage.aspx", "pages", "homepage" } };
+ yield return new object[] { engine, null, @"(.+)/(.*).aspx", "/pages/homepage.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx", "/pages", "homepage" } };
+ yield return new object[] { engine, null, @"(.+)/(.*).aspx", "/pages/homepage/index.aspx", RegexOptions.None, new string[] { "/pages/homepage/index.aspx", "/pages/homepage", "index" } };
+ yield return new object[] { engine, null, @"(.+)/(.*).aspx", "/pages/homepage.aspx/index.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", "/pages/homepage.aspx", "index" } };
+ yield return new object[] { engine, null, @"(.+)/(.*)/(.*).aspx", "/pages/homepage.aspx/index.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", "/pages", "homepage.aspx", "index" } };
+
+ // Backtracking with (.*) group followed by (.+)
+ yield return new object[] { engine, null, @"(.*)/(.+).aspx", "/homepage.aspx", RegexOptions.None, new string[] { "/homepage.aspx", string.Empty, "homepage" } };
+ yield return new object[] { engine, null, @"(.*)/(.+).aspx", "pages/homepage.aspx", RegexOptions.None, new string[] { "pages/homepage.aspx", "pages", "homepage" } };
+ yield return new object[] { engine, null, @"(.*)/(.+).aspx", "/pages/homepage.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx", "/pages", "homepage" } };
+ yield return new object[] { engine, null, @"(.*)/(.+).aspx", "/pages/homepage/index.aspx", RegexOptions.None, new string[] { "/pages/homepage/index.aspx", "/pages/homepage", "index" } };
+ yield return new object[] { engine, null, @"(.*)/(.+).aspx", "/pages/homepage.aspx/index.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", "/pages/homepage.aspx", "index" } };
+ yield return new object[] { engine, null, @"(.*)/(.+)/(.+).aspx", "/pages/homepage.aspx/index.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", "/pages", "homepage.aspx", "index" } };
+
+ // Quantifiers
+ yield return new object[] { engine, null, @"a*", "", RegexOptions.None, new string[] { "" } };
+ yield return new object[] { engine, null, @"a*", "a", RegexOptions.None, new string[] { "a" } };
+ yield return new object[] { engine, null, @"a*", "aa", RegexOptions.None, new string[] { "aa" } };
+ yield return new object[] { engine, null, @"a*", "aaa", RegexOptions.None, new string[] { "aaa" } };
+ yield return new object[] { engine, null, @"a*?", "", RegexOptions.None, new string[] { "" } };
+ yield return new object[] { engine, null, @"a*?", "a", RegexOptions.None, new string[] { "" } };
+ yield return new object[] { engine, null, @"a*?", "aa", RegexOptions.None, new string[] { "" } };
+ yield return new object[] { engine, null, @"a+?", "aa", RegexOptions.None, new string[] { "a" } };
+ yield return new object[] { engine, null, @"a{1,", "a{1,", RegexOptions.None, new string[] { "a{1," } };
+ yield return new object[] { engine, null, @"a{1,3}", "aaaaa", RegexOptions.None, new string[] { "aaa" } };
+ yield return new object[] { engine, null, @"a{1,3}?", "aaaaa", RegexOptions.None, new string[] { "a" } };
+ yield return new object[] { engine, null, @"a{2,2}", "aaaaa", RegexOptions.None, new string[] { "aa" } };
+ yield return new object[] { engine, null, @"a{2,2}?", "aaaaa", RegexOptions.None, new string[] { "aa" } };
+ yield return new object[] { engine, null, @".{1,3}", "bb\nba", RegexOptions.None, new string[] { "bb" } };
+ yield return new object[] { engine, null, @".{1,3}?", "bb\nba", RegexOptions.None, new string[] { "b" } };
+ yield return new object[] { engine, null, @".{2,2}", "bbb\nba", RegexOptions.None, new string[] { "bb" } };
+ yield return new object[] { engine, null, @".{2,2}?", "bbb\nba", RegexOptions.None, new string[] { "bb" } };
+ yield return new object[] { engine, null, @"[abc]{1,3}", "ccaba", RegexOptions.None, new string[] { "cca" } };
+ yield return new object[] { engine, null, @"[abc]{1,3}?", "ccaba", RegexOptions.None, new string[] { "c" } };
+ yield return new object[] { engine, null, @"[abc]{2,2}", "ccaba", RegexOptions.None, new string[] { "cc" } };
+ yield return new object[] { engine, null, @"[abc]{2,2}?", "ccaba", RegexOptions.None, new string[] { "cc" } };
+ yield return new object[] { engine, null, @"(?:[abc]def){1,3}xyz", "cdefxyz", RegexOptions.None, new string[] { "cdefxyz" } };
+ yield return new object[] { engine, null, @"(?:[abc]def){1,3}xyz", "adefbdefcdefxyz", RegexOptions.None, new string[] { "adefbdefcdefxyz" } };
+ yield return new object[] { engine, null, @"(?:[abc]def){1,3}?xyz", "cdefxyz", RegexOptions.None, new string[] { "cdefxyz" } };
+ yield return new object[] { engine, null, @"(?:[abc]def){1,3}?xyz", "adefbdefcdefxyz", RegexOptions.None, new string[] { "adefbdefcdefxyz" } };
+ yield return new object[] { engine, null, @"(?:[abc]def){2,2}xyz", "adefbdefcdefxyz", RegexOptions.None, new string[] { "bdefcdefxyz" } };
+ yield return new object[] { engine, null, @"(?:[abc]def){2,2}?xyz", "adefbdefcdefxyz", RegexOptions.None, new string[] { "bdefcdefxyz" } };
+ foreach (string prefix in new[] { "", "xyz" })
+ {
+ yield return new object[] { engine, null, prefix + @"(?:[abc]def){1,3}", prefix + "cdef", RegexOptions.None, new string[] { prefix + "cdef" } };
+ yield return new object[] { engine, null, prefix + @"(?:[abc]def){1,3}", prefix + "cdefadefbdef", RegexOptions.None, new string[] { prefix + "cdefadefbdef" } };
+ yield return new object[] { engine, null, prefix + @"(?:[abc]def){1,3}", prefix + "cdefadefbdefadef", RegexOptions.None, new string[] { prefix + "cdefadefbdef" } };
+ yield return new object[] { engine, null, prefix + @"(?:[abc]def){1,3}?", prefix + "cdef", RegexOptions.None, new string[] { prefix + "cdef" } };
+ yield return new object[] { engine, null, prefix + @"(?:[abc]def){1,3}?", prefix + "cdefadefbdef", RegexOptions.None, new string[] { prefix + "cdef" } };
+ yield return new object[] { engine, null, prefix + @"(?:[abc]def){2,2}", prefix + "cdefadefbdefadef", RegexOptions.None, new string[] { prefix + "cdefadef" } };
+ yield return new object[] { engine, null, prefix + @"(?:[abc]def){2,2}?", prefix + "cdefadefbdefadef", RegexOptions.None, new string[] { prefix + "cdefadef" } };
+ }
+ yield return new object[] { engine, null, @"(cat){", "cat{", RegexOptions.None, new string[] { "cat{", "cat" } };
+ yield return new object[] { engine, null, @"(cat){}", "cat{}", RegexOptions.None, new string[] { "cat{}", "cat" } };
+ yield return new object[] { engine, null, @"(cat){,", "cat{,", RegexOptions.None, new string[] { "cat{,", "cat" } };
+ yield return new object[] { engine, null, @"(cat){,}", "cat{,}", RegexOptions.None, new string[] { "cat{,}", "cat" } };
+ yield return new object[] { engine, null, @"(cat){cat}", "cat{cat}", RegexOptions.None, new string[] { "cat{cat}", "cat" } };
+ yield return new object[] { engine, null, @"(cat){cat,5}", "cat{cat,5}", RegexOptions.None, new string[] { "cat{cat,5}", "cat" } };
+ yield return new object[] { engine, null, @"(cat){5,dog}", "cat{5,dog}", RegexOptions.None, new string[] { "cat{5,dog}", "cat" } };
+ yield return new object[] { engine, null, @"(cat){cat,dog}", "cat{cat,dog}", RegexOptions.None, new string[] { "cat{cat,dog}", "cat" } };
+ yield return new object[] { engine, null, @"(cat){,}?", "cat{,}?", RegexOptions.None, new string[] { "cat{,}", "cat" } };
+ yield return new object[] { engine, null, @"(cat){cat}?", "cat{cat}?", RegexOptions.None, new string[] { "cat{cat}", "cat" } };
+ yield return new object[] { engine, null, @"(cat){cat,5}?", "cat{cat,5}?", RegexOptions.None, new string[] { "cat{cat,5}", "cat" } };
+ yield return new object[] { engine, null, @"(cat){5,dog}?", "cat{5,dog}?", RegexOptions.None, new string[] { "cat{5,dog}", "cat" } };
+ yield return new object[] { engine, null, @"(cat){cat,dog}?", "cat{cat,dog}?", RegexOptions.None, new string[] { "cat{cat,dog}", "cat" } };
+
+ // Atomic subexpressions
+ // Implicitly upgrading (or not) oneloop to be atomic
+ yield return new object[] { engine, null, @"a*b", "aaab", RegexOptions.None, new string[] { "aaab" } };
+ yield return new object[] { engine, null, @"a*b+", "aaab", RegexOptions.None, new string[] { "aaab" } };
+ yield return new object[] { engine, null, @"a*b+?", "aaab", RegexOptions.None, new string[] { "aaab" } };
+ yield return new object[] { engine, null, @"a*(?>b+)", "aaab", RegexOptions.None, new string[] { "aaab" } };
+ yield return new object[] { engine, null, @"a*[^a]", "aaab", RegexOptions.None, new string[] { "aaab" } };
+ yield return new object[] { engine, null, @"a*[^a]+", "aaab", RegexOptions.None, new string[] { "aaab" } };
+ yield return new object[] { engine, null, @"a*[^a]+?", "aaab", RegexOptions.None, new string[] { "aaab" } };
+ yield return new object[] { engine, null, @"a*(?>[^a]+)", "aaab", RegexOptions.None, new string[] { "aaab" } };
+ yield return new object[] { engine, null, @"a*bcd", "aaabcd", RegexOptions.None, new string[] { "aaabcd" } };
+ yield return new object[] { engine, null, @"a*[bcd]", "aaab", RegexOptions.None, new string[] { "aaab" } };
+ yield return new object[] { engine, null, @"a*[bcd]+", "aaab", RegexOptions.None, new string[] { "aaab" } };
+ yield return new object[] { engine, null, @"a*[bcd]+?", "aaab", RegexOptions.None, new string[] { "aaab" } };
+ yield return new object[] { engine, null, @"a*(?>[bcd]+)", "aaab", RegexOptions.None, new string[] { "aaab" } };
+ yield return new object[] { engine, null, @"a*[bcd]{1,3}", "aaab", RegexOptions.None, new string[] { "aaab" } };
+ yield return new object[] { engine, null, @"a*([bcd]ab|[bef]cd){1,3}", "aaababecdcac", RegexOptions.ExplicitCapture, new string[] { "aaababecd" } };
+ yield return new object[] { engine, null, @"a*([bcd]|[aef]){1,3}", "befb", RegexOptions.ExplicitCapture, new string[] { "bef" } }; // can't upgrade
+ yield return new object[] { engine, null, @"a*$", "aaa", RegexOptions.None, new string[] { "aaa" } };
+ yield return new object[] { engine, null, @"a*$", "aaa", RegexOptions.Multiline, new string[] { "aaa" } };
+ yield return new object[] { engine, null, @"a*\b", "aaa bbb", RegexOptions.None, new string[] { "aaa" } };
+ yield return new object[] { engine, null, @"a*\b", "aaa bbb", RegexOptions.ECMAScript, new string[] { "aaa" } };
+ yield return new object[] { engine, null, @"@*\B", "@@@", RegexOptions.None, new string[] { "@@@" } };
+ yield return new object[] { engine, null, @"@*\B", "@@@", RegexOptions.ECMAScript, new string[] { "@@@" } };
+ yield return new object[] { engine, null, @"(?:abcd*|efgh)i", "efghi", RegexOptions.None, new string[] { "efghi" } };
+ yield return new object[] { engine, null, @"(?:abcd|efgh*)i", "efgi", RegexOptions.None, new string[] { "efgi" } };
+ yield return new object[] { engine, null, @"(?:abcd|efghj{2,}|j[klm]o+)i", "efghjjjjji", RegexOptions.None, new string[] { "efghjjjjji" } };
+ yield return new object[] { engine, null, @"(?:abcd|efghi{2,}|j[klm]o+)i", "efghiii", RegexOptions.None, new string[] { "efghiii" } };
+ yield return new object[] { engine, null, @"(?:abcd|efghi{2,}|j[klm]o+)i", "efghiiiiiiii", RegexOptions.None, new string[] { "efghiiiiiiii" } };
+ yield return new object[] { engine, null, @"a?ba?ba?ba?b", "abbabab", RegexOptions.None, new string[] { "abbabab" } };
+ yield return new object[] { engine, null, @"a?ba?ba?ba?b", "abBAbab", RegexOptions.IgnoreCase, new string[] { "abBAbab" } };
+ // Implicitly upgrading (or not) notoneloop to be atomic
+ yield return new object[] { engine, null, @"[^b]*b", "aaab", RegexOptions.None, new string[] { "aaab" } };
+ yield return new object[] { engine, null, @"[^b]*b+", "aaab", RegexOptions.None, new string[] { "aaab" } };
+ yield return new object[] { engine, null, @"[^b]*b+?", "aaab", RegexOptions.None, new string[] { "aaab" } };
+ yield return new object[] { engine, null, @"[^b]*(?>b+)", "aaab", RegexOptions.None, new string[] { "aaab" } };
+ yield return new object[] { engine, null, @"[^b]*bac", "aaabac", RegexOptions.None, new string[] { "aaabac" } };
+ yield return new object[] { engine, null, @"[^b]*", "aaa", RegexOptions.None, new string[] { "aaa" } };
+ yield return new object[] { engine, null, @"(?:abc[^b]*|efgh)i", "efghi", RegexOptions.None, new string[] { "efghi" } }; // can't upgrade
+ yield return new object[] { engine, null, @"(?:abcd|efg[^b]*)b", "efgb", RegexOptions.None, new string[] { "efgb" } };
+ yield return new object[] { engine, null, @"(?:abcd|efg[^b]*)i", "efgi", RegexOptions.None, new string[] { "efgi" } }; // can't upgrade
+ yield return new object[] { engine, null, @"[^a]?a[^a]?a[^a]?a[^a]?a", "baababa", RegexOptions.None, new string[] { "baababa" } };
+ yield return new object[] { engine, null, @"[^a]?a[^a]?a[^a]?a[^a]?a", "BAababa", RegexOptions.IgnoreCase, new string[] { "BAababa" } };
+ // Implicitly upgrading (or not) setloop to be atomic
+ yield return new object[] { engine, null, @"[ac]*", "aaa", RegexOptions.None, new string[] { "aaa" } };
+ yield return new object[] { engine, null, @"[ac]*b", "aaab", RegexOptions.None, new string[] { "aaab" } };
+ yield return new object[] { engine, null, @"[ac]*b+", "aaab", RegexOptions.None, new string[] { "aaab" } };
+ yield return new object[] { engine, null, @"[ac]*b+?", "aaab", RegexOptions.None, new string[] { "aaab" } };
+ yield return new object[] { engine, null, @"[ac]*(?>b+)", "aaab", RegexOptions.None, new string[] { "aaab" } };
+ yield return new object[] { engine, null, @"[ac]*[^a]", "aaab", RegexOptions.None, new string[] { "aaab" } };
+ yield return new object[] { engine, null, @"[ac]*[^a]+", "aaab", RegexOptions.None, new string[] { "aaab" } };
+ yield return new object[] { engine, null, @"[ac]*[^a]+?", "aaab", RegexOptions.None, new string[] { "aaab" } };
+ yield return new object[] { engine, null, @"[ac]*(?>[^a]+)", "aaab", RegexOptions.None, new string[] { "aaab" } };
+ yield return new object[] { engine, null, @"[ac]*bcd", "aaabcd", RegexOptions.None, new string[] { "aaabcd" } };
+ yield return new object[] { engine, null, @"[ac]*[bd]", "aaab", RegexOptions.None, new string[] { "aaab" } };
+ yield return new object[] { engine, null, @"[ac]*[bd]+", "aaab", RegexOptions.None, new string[] { "aaab" } };
+ yield return new object[] { engine, null, @"[ac]*[bd]+?", "aaab", RegexOptions.None, new string[] { "aaab" } };
+ yield return new object[] { engine, null, @"[ac]*(?>[bd]+)", "aaab", RegexOptions.None, new string[] { "aaab" } };
+ yield return new object[] { engine, null, @"[ac]*[bd]{1,3}", "aaab", RegexOptions.None, new string[] { "aaab" } };
+ yield return new object[] { engine, null, @"[ac]*$", "aaa", RegexOptions.None, new string[] { "aaa" } };
+ yield return new object[] { engine, null, @"[ac]*$", "aaa", RegexOptions.Multiline, new string[] { "aaa" } };
+ yield return new object[] { engine, null, @"[ac]*\b", "aaa bbb", RegexOptions.None, new string[] { "aaa" } };
+ yield return new object[] { engine, null, @"[ac]*\b", "aaa bbb", RegexOptions.ECMAScript, new string[] { "aaa" } };
+ yield return new object[] { engine, null, @"[@']*\B", "@@@", RegexOptions.None, new string[] { "@@@" } };
+ yield return new object[] { engine, null, @"[@']*\B", "@@@", RegexOptions.ECMAScript, new string[] { "@@@" } };
+ yield return new object[] { engine, null, @".*.", "@@@", RegexOptions.Singleline, new string[] { "@@@" } };
+ yield return new object[] { engine, null, @"(?:abcd|efg[hij]*)h", "efgh", RegexOptions.None, new string[] { "efgh" } }; // can't upgrade
+ yield return new object[] { engine, null, @"(?:abcd|efg[hij]*)ih", "efgjih", RegexOptions.None, new string[] { "efgjih" } }; // can't upgrade
+ yield return new object[] { engine, null, @"(?:abcd|efg[hij]*)k", "efgjk", RegexOptions.None, new string[] { "efgjk" } };
+ yield return new object[] { engine, null, @"[ace]?b[ace]?b[ace]?b[ace]?b", "cbbabeb", RegexOptions.None, new string[] { "cbbabeb" } };
+ yield return new object[] { engine, null, @"[ace]?b[ace]?b[ace]?b[ace]?b", "cBbAbEb", RegexOptions.IgnoreCase, new string[] { "cBbAbEb" } };
+ yield return new object[] { engine, null, @"a[^wz]*w", "abcdcdcdwz", RegexOptions.None, new string[] { "abcdcdcdw" } };
+ yield return new object[] { engine, null, @"a[^wyz]*w", "abcdcdcdwz", RegexOptions.None, new string[] { "abcdcdcdw" } };
+ yield return new object[] { engine, null, @"a[^wyz]*W", "abcdcdcdWz", RegexOptions.IgnoreCase, new string[] { "abcdcdcdW" } };
+ // Implicitly upgrading (or not) concat loops to be atomic
+ yield return new object[] { engine, null, @"(?:[ab]c[de]f)*", "", RegexOptions.None, new string[] { "" } };
+ yield return new object[] { engine, null, @"(?:[ab]c[de]f)*", "acdf", RegexOptions.None, new string[] { "acdf" } };
+ yield return new object[] { engine, null, @"(?:[ab]c[de]f)*", "acdfbcef", RegexOptions.None, new string[] { "acdfbcef" } };
+ yield return new object[] { engine, null, @"(?:[ab]c[de]f)*", "cdfbcef", RegexOptions.None, new string[] { "" } };
+ yield return new object[] { engine, null, @"(?:[ab]c[de]f)+", "cdfbcef", RegexOptions.None, new string[] { "bcef" } };
+ yield return new object[] { engine, null, @"(?:[ab]c[de]f)*", "bcefbcdfacfe", RegexOptions.None, new string[] { "bcefbcdf" } };
+ // Implicitly upgrading (or not) nested loops to be atomic
+ yield return new object[] { engine, null, @"(?:a){3}", "aaaaaaaaa", RegexOptions.None, new string[] { "aaa" } };
+ yield return new object[] { engine, null, @"(?:a){3}?", "aaaaaaaaa", RegexOptions.None, new string[] { "aaa" } };
+ yield return new object[] { engine, null, @"(?:a{2}){3}", "aaaaaaaaa", RegexOptions.None, new string[] { "aaaaaa" } };
+ yield return new object[] { engine, null, @"(?:a{2}?){3}?", "aaaaaaaaa", RegexOptions.None, new string[] { "aaaaaa" } };
+ yield return new object[] { engine, null, @"(?:(?:[ab]c[de]f){3}){2}", "acdfbcdfacefbcefbcefbcdfacdef", RegexOptions.None, new string[] { "acdfbcdfacefbcefbcefbcdf" } };
+ yield return new object[] { engine, null, @"(?:(?:[ab]c[de]f){3}hello){2}", "aaaaaacdfbcdfacefhellobcefbcefbcdfhellooooo", RegexOptions.None, new string[] { "acdfbcdfacefhellobcefbcefbcdfhello" } };
+ yield return new object[] { engine, null, @"CN=(.*[^,]+).*", "CN=localhost", RegexOptions.Singleline, new string[] { "CN=localhost", "localhost" } };
+ // Nested atomic
+ yield return new object[] { engine, null, @"(?>abc[def]gh(i*))", "123abceghiii456", RegexOptions.None, new string[] { "abceghiii", "iii" } };
+ yield return new object[] { engine, null, @"(?>(?:abc)*)", "abcabcabc", RegexOptions.None, new string[] { "abcabcabc" } };
+
+ // Anchoring loops beginning with .* / .+
+ yield return new object[] { engine, null, @".*", "", RegexOptions.None, new string[] { "" } };
+ yield return new object[] { engine, null, @".*", "\n\n\n\n", RegexOptions.None, new string[] { "" } };
+ yield return new object[] { engine, null, @".*", "\n\n\n\n", RegexOptions.Singleline, new string[] { "\n\n\n\n" } };
+ yield return new object[] { engine, null, @".*[1a]", "\n\n\n\n1", RegexOptions.None, new string[] { "1" } };
+ yield return new object[] { engine, null, @"(?s).*(?-s)[1a]", "1\n\n\n\n", RegexOptions.None, new string[] { "1" } };
+ yield return new object[] { engine, null, @"(?s).*(?-s)[1a]", "\n\n\n\n1", RegexOptions.None, new string[] { "\n\n\n\n1" } };
+ yield return new object[] { engine, null, @".*|.*|.*", "", RegexOptions.None, new string[] { "" } };
+ yield return new object[] { engine, null, @".*123|abc", "abc\n123", RegexOptions.None, new string[] { "abc" } };
+ yield return new object[] { engine, null, @".*123|abc", "abc\n123", RegexOptions.Singleline, new string[] { "abc\n123" }, "abc" }; // <-- Nonbacktracking match same as for "abc|.*123"
+ yield return new object[] { engine, null, @"abc|.*123", "abc\n123", RegexOptions.Singleline, new string[] { "abc" } };
+ yield return new object[] { engine, null, @".*", "\n", RegexOptions.None, new string[] { "" } };
+ yield return new object[] { engine, null, @".*\n", "\n", RegexOptions.None, new string[] { "\n" } };
+ yield return new object[] { engine, null, @".*", "\n", RegexOptions.Singleline, new string[] { "\n" } };
+ yield return new object[] { engine, null, @".*\n", "\n", RegexOptions.Singleline, new string[] { "\n" } };
+ yield return new object[] { engine, null, @".*", "abc", RegexOptions.None, new string[] { "abc" } };
+ yield return new object[] { engine, null, @".*abc", "abc", RegexOptions.None, new string[] { "abc" } };
+ yield return new object[] { engine, null, @".*abc|ghi", "ghi", RegexOptions.None, new string[] { "ghi" } };
+ yield return new object[] { engine, null, @".*abc|.*ghi", "abcghi", RegexOptions.None, new string[] { "abc" }, "abcghi" }; // <-- Nonbacktracking match same as for ".*ghi|.*abc"
+ yield return new object[] { engine, null, @".*ghi|.*abc", "abcghi", RegexOptions.None, new string[] { "abcghi" } };
+ yield return new object[] { engine, null, @".*abc|.*ghi", "bcghi", RegexOptions.None, new string[] { "bcghi" } };
+ yield return new object[] { engine, null, @".*abc|.+c", " \n \n bc", RegexOptions.None, new string[] { " bc" } };
+ yield return new object[] { engine, null, @".*abc", "12345 abc", RegexOptions.None, new string[] { "12345 abc" } };
+ yield return new object[] { engine, null, @".*abc", "12345\n abc", RegexOptions.None, new string[] { " abc" } };
+ yield return new object[] { engine, null, @".*abc", "12345\n abc", RegexOptions.Singleline, new string[] { "12345\n abc" } };
+ yield return new object[] { engine, null, @"(.*)abc\1", "\n12345abc12345", RegexOptions.Singleline, new string[] { "12345abc12345", "12345" } };
+ yield return new object[] { engine, null, @".*\nabc", "\n123\nabc", RegexOptions.None, new string[] { "123\nabc" } };
+ yield return new object[] { engine, null, @".*\nabc", "\n123\nabc", RegexOptions.Singleline, new string[] { "\n123\nabc" } };
+ yield return new object[] { engine, null, @".*abc", "abc abc abc \nabc", RegexOptions.None, new string[] { "abc abc abc" } };
+ yield return new object[] { engine, null, @".*abc", "abc abc abc \nabc", RegexOptions.Singleline, new string[] { "abc abc abc \nabc" } };
+ yield return new object[] { engine, null, @".*?abc", "abc abc abc \nabc", RegexOptions.None, new string[] { "abc" } };
+ yield return new object[] { engine, null, @"[^\n]*abc", "123abc\n456abc\n789abc", RegexOptions.None, new string[] { "123abc" } };
+ yield return new object[] { engine, null, @"[^\n]*abc", "123abc\n456abc\n789abc", RegexOptions.Singleline, new string[] { "123abc" } };
+ yield return new object[] { engine, null, @"[^\n]*abc", "123ab\n456abc\n789abc", RegexOptions.None, new string[] { "456abc" } };
+ yield return new object[] { engine, null, @"[^\n]*abc", "123ab\n456abc\n789abc", RegexOptions.Singleline, new string[] { "456abc" } };
+ yield return new object[] { engine, null, @".+", "a", RegexOptions.None, new string[] { "a" } };
+ yield return new object[] { engine, null, @".+", "\nabc", RegexOptions.None, new string[] { "abc" } };
+ yield return new object[] { engine, null, @".+", "\n", RegexOptions.Singleline, new string[] { "\n" } };
+ yield return new object[] { engine, null, @".+", "\nabc", RegexOptions.Singleline, new string[] { "\nabc" } };
+ yield return new object[] { engine, null, @".+abc", "aaaabc", RegexOptions.None, new string[] { "aaaabc" } };
+ yield return new object[] { engine, null, @".+abc", "12345 abc", RegexOptions.None, new string[] { "12345 abc" } };
+ yield return new object[] { engine, null, @".+abc", "12345\n abc", RegexOptions.None, new string[] { " abc" } };
+ yield return new object[] { engine, null, @".+abc", "12345\n abc", RegexOptions.Singleline, new string[] { "12345\n abc" } };
+ yield return new object[] { engine, null, @"(.+)abc\1", "\n12345abc12345", RegexOptions.Singleline, new string[] { "12345abc12345", "12345" } };
+
+ // Unanchored .* (the .* loop does not begin the pattern)
+ yield return new object[] { engine, null, @"\A\s*(?<name>\w+)(\s*\((?<arguments>.*)\))?\s*\Z", "Match(Name)", RegexOptions.None, new string[] { "Match(Name)", "(Name)", "Match", "Name" } };
+ yield return new object[] { engine, null, @"\A\s*(?<name>\w+)(\s*\((?<arguments>.*)\))?\s*\Z", "Match(Na\nme)", RegexOptions.Singleline, new string[] { "Match(Na\nme)", "(Na\nme)", "Match", "Na\nme" } };
+ foreach (RegexOptions options in new[] { RegexOptions.None, RegexOptions.Singleline })
+ {
+ yield return new object[] { engine, null, @"abcd.*", @"abcabcd", options, new string[] { "abcd" } };
+ yield return new object[] { engine, null, @"abcd.*", @"abcabcde", options, new string[] { "abcde" } };
+ yield return new object[] { engine, null, @"abcd.*", @"abcabcdefg", options, new string[] { "abcdefg" } };
+ yield return new object[] { engine, null, @"abcd(.*)", @"ababcd", options, new string[] { "abcd", "" } };
+ yield return new object[] { engine, null, @"abcd(.*)", @"aabcde", options, new string[] { "abcde", "e" } };
+ yield return new object[] { engine, null, @"abcd(.*)", @"abcabcdefg", options, new string[] { "abcdefg", "efg" } };
+ yield return new object[] { engine, null, @"abcd(.*)e", @"abcabcdefg", options, new string[] { "abcde", "" } };
+ yield return new object[] { engine, null, @"abcd(.*)f", @"abcabcdefg", options, new string[] { "abcdef", "e" } };
+ }
- // Grouping Constructs Invalid Regular Expressions
- yield return new object[] { null, @"()", "cat", RegexOptions.None, new string[] { string.Empty, string.Empty } };
- yield return new object[] { null, @"(?<cat>)", "cat", RegexOptions.None, new string[] { string.Empty, string.Empty } };
- yield return new object[] { null, @"(?'cat')", "cat", RegexOptions.None, new string[] { string.Empty, string.Empty } };
- yield return new object[] { null, @"(?:)", "cat", RegexOptions.None, new string[] { string.Empty } };
- yield return new object[] { null, @"(?imn)", "cat", RegexOptions.None, new string[] { string.Empty } };
- yield return new object[] { null, @"(?imn)cat", "(?imn)cat", RegexOptions.None, new string[] { "cat" } };
- yield return new object[] { null, @"(?=)", "cat", RegexOptions.None, new string[] { string.Empty } };
- yield return new object[] { null, @"(?<=)", "cat", RegexOptions.None, new string[] { string.Empty } };
- yield return new object[] { null, @"(?>)", "cat", RegexOptions.None, new string[] { string.Empty } };
-
- // Alternation construct Invalid Regular Expressions
- yield return new object[] { null, @"(?()|)", "(?()|)", RegexOptions.None, new string[] { "" } };
-
- yield return new object[] { null, @"(?(cat)|)", "cat", RegexOptions.None, new string[] { "" } };
- yield return new object[] { null, @"(?(cat)|)", "dog", RegexOptions.None, new string[] { "" } };
-
- yield return new object[] { null, @"(?(cat)catdog|)", "catdog", RegexOptions.None, new string[] { "catdog" } };
- yield return new object[] { null, @"(?(cat)catdog|)", "dog", RegexOptions.None, new string[] { "" } };
- yield return new object[] { null, @"(?(cat)dog|)", "dog", RegexOptions.None, new string[] { "" } };
- yield return new object[] { null, @"(?(cat)dog|)", "cat", RegexOptions.None, new string[] { "" } };
-
- yield return new object[] { null, @"(?(cat)|catdog)", "cat", RegexOptions.None, new string[] { "" } };
- yield return new object[] { null, @"(?(cat)|catdog)", "catdog", RegexOptions.None, new string[] { "" } };
- yield return new object[] { null, @"(?(cat)|dog)", "dog", RegexOptions.None, new string[] { "dog" } };
-
- // Invalid unicode
- yield return new object[] { null, "([\u0000-\uFFFF-[azAZ09]]|[\u0000-\uFFFF-[^azAZ09]])+", "azAZBCDE1234567890BCDEFAZza", RegexOptions.None, new string[] { "azAZBCDE1234567890BCDEFAZza", "a" } };
- yield return new object[] { null, "[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[a]]]]]]+", "abcxyzABCXYZ123890", RegexOptions.None, new string[] { "bcxyzABCXYZ123890" } };
- yield return new object[] { null, "[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[a]]]]]]]+", "bcxyzABCXYZ123890a", RegexOptions.None, new string[] { "a" } };
- yield return new object[] { null, "[\u0000-\uFFFF-[\\p{P}\\p{S}\\p{C}]]+", "!@`';.,$+<>=\x0001\x001FazAZ09", RegexOptions.None, new string[] { "azAZ09" } };
-
- yield return new object[] { null, @"[\uFFFD-\uFFFF]+", "\uFFFC\uFFFD\uFFFE\uFFFF", RegexOptions.IgnoreCase, new string[] { "\uFFFD\uFFFE\uFFFF" } };
- yield return new object[] { null, @"[\uFFFC-\uFFFE]+", "\uFFFB\uFFFC\uFFFD\uFFFE\uFFFF", RegexOptions.IgnoreCase, new string[] { "\uFFFC\uFFFD\uFFFE" } };
-
- // Empty Match
- yield return new object[] { null, @"([a*]*)+?$", "ab", RegexOptions.None, new string[] { "", "" } };
- yield return new object[] { null, @"(a*)+?$", "b", RegexOptions.None, new string[] { "", "" } };
+ // Grouping Constructs Invalid Regular Expressions
+ yield return new object[] { engine, null, @"()", "cat", RegexOptions.None, new string[] { string.Empty, string.Empty } };
+ yield return new object[] { engine, null, @"(?<cat>)", "cat", RegexOptions.None, new string[] { string.Empty, string.Empty } };
+ yield return new object[] { engine, null, @"(?'cat')", "cat", RegexOptions.None, new string[] { string.Empty, string.Empty } };
+ yield return new object[] { engine, null, @"(?:)", "cat", RegexOptions.None, new string[] { string.Empty } };
+ yield return new object[] { engine, null, @"(?imn)", "cat", RegexOptions.None, new string[] { string.Empty } };
+ yield return new object[] { engine, null, @"(?imn)cat", "(?imn)cat", RegexOptions.None, new string[] { "cat" } };
+ yield return new object[] { engine, null, @"(?=)", "cat", RegexOptions.None, new string[] { string.Empty } };
+ yield return new object[] { engine, null, @"(?<=)", "cat", RegexOptions.None, new string[] { string.Empty } };
+ yield return new object[] { engine, null, @"(?>)", "cat", RegexOptions.None, new string[] { string.Empty } };
+
+ // Alternation construct Invalid Regular Expressions
+ yield return new object[] { engine, null, @"(?()|)", "(?()|)", RegexOptions.None, new string[] { "" } };
+
+ yield return new object[] { engine, null, @"(?(cat)|)", "cat", RegexOptions.None, new string[] { "" } };
+ yield return new object[] { engine, null, @"(?(cat)|)", "dog", RegexOptions.None, new string[] { "" } };
+
+ yield return new object[] { engine, null, @"(?(cat)catdog|)", "catdog", RegexOptions.None, new string[] { "catdog" } };
+ yield return new object[] { engine, null, @"(?(cat)catdog|)", "dog", RegexOptions.None, new string[] { "" } };
+ yield return new object[] { engine, null, @"(?(cat)dog|)", "dog", RegexOptions.None, new string[] { "" } };
+ yield return new object[] { engine, null, @"(?(cat)dog|)", "cat", RegexOptions.None, new string[] { "" } };
+
+ yield return new object[] { engine, null, @"(?(cat)|catdog)", "cat", RegexOptions.None, new string[] { "" } };
+ yield return new object[] { engine, null, @"(?(cat)|catdog)", "catdog", RegexOptions.None, new string[] { "" } };
+ yield return new object[] { engine, null, @"(?(cat)|dog)", "dog", RegexOptions.None, new string[] { "dog" } };
+
+ // Invalid unicode
+ yield return new object[] { engine, null, "([\u0000-\uFFFF-[azAZ09]]|[\u0000-\uFFFF-[^azAZ09]])+", "azAZBCDE1234567890BCDEFAZza", RegexOptions.None, new string[] { "azAZBCDE1234567890BCDEFAZza", "a" } };
+ yield return new object[] { engine, null, "[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[a]]]]]]+", "abcxyzABCXYZ123890", RegexOptions.None, new string[] { "bcxyzABCXYZ123890" } };
+ yield return new object[] { engine, null, "[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[a]]]]]]]+", "bcxyzABCXYZ123890a", RegexOptions.None, new string[] { "a" } };
+ yield return new object[] { engine, null, "[\u0000-\uFFFF-[\\p{P}\\p{S}\\p{C}]]+", "!@`';.,$+<>=\x0001\x001FazAZ09", RegexOptions.None, new string[] { "azAZ09" } };
+
+ yield return new object[] { engine, null, @"[\uFFFD-\uFFFF]+", "\uFFFC\uFFFD\uFFFE\uFFFF", RegexOptions.IgnoreCase, new string[] { "\uFFFD\uFFFE\uFFFF" } };
+ yield return new object[] { engine, null, @"[\uFFFC-\uFFFE]+", "\uFFFB\uFFFC\uFFFD\uFFFE\uFFFF", RegexOptions.IgnoreCase, new string[] { "\uFFFC\uFFFD\uFFFE" } };
+
+ // Empty Match
+ yield return new object[] { engine, null, @"([a*]*)+?$", "ab", RegexOptions.None, new string[] { "", "" } };
+ yield return new object[] { engine, null, @"(a*)+?$", "b", RegexOptions.None, new string[] { "", "" } };
+ }
}
public static IEnumerable<object[]> Groups_CustomCulture_TestData_enUS()
{
- yield return new object[] { "en-US", "CH", "Ch", RegexOptions.IgnoreCase, new string[] { "Ch" } };
- yield return new object[] { "en-US", "cH", "Ch", RegexOptions.IgnoreCase, new string[] { "Ch" } };
- yield return new object[] { "en-US", "AA", "Aa", RegexOptions.IgnoreCase, new string[] { "Aa" } };
- yield return new object[] { "en-US", "aA", "Aa", RegexOptions.IgnoreCase, new string[] { "Aa" } };
- yield return new object[] { "en-US", "\u0130", "\u0049", RegexOptions.IgnoreCase, new string[] { "\u0049" } };
- yield return new object[] { "en-US", "\u0130", "\u0069", RegexOptions.IgnoreCase, new string[] { "\u0069" } };
+ foreach (RegexEngine engine in RegexHelpers.AvailableEngines) // Yields the en-US IgnoreCase rows once per available engine. Row layout: engine, culture name, pattern, input, options, expected groups (presumably the Groups theory's parameters - verify).
+ {
+ yield return new object[] { engine, "en-US", "CH", "Ch", RegexOptions.IgnoreCase, new string[] { "Ch" } };
+ yield return new object[] { engine, "en-US", "cH", "Ch", RegexOptions.IgnoreCase, new string[] { "Ch" } };
+ yield return new object[] { engine, "en-US", "AA", "Aa", RegexOptions.IgnoreCase, new string[] { "Aa" } };
+ yield return new object[] { engine, "en-US", "aA", "Aa", RegexOptions.IgnoreCase, new string[] { "Aa" } };
+ yield return new object[] { engine, "en-US", "\u0130", "\u0049", RegexOptions.IgnoreCase, new string[] { "\u0049" } }; // \u0130 against 'I' (\u0049); the row expects a match
+ yield return new object[] { engine, "en-US", "\u0130", "\u0069", RegexOptions.IgnoreCase, new string[] { "\u0069" } }; // \u0130 against 'i' (\u0069); the row expects a match
+ }
}
public static IEnumerable<object[]> Groups_CustomCulture_TestData_Czech()
{
- yield return new object[] { "cs-CZ", "CH", "Ch", RegexOptions.IgnoreCase, new string[] { "Ch" } };
- yield return new object[] { "cs-CZ", "cH", "Ch", RegexOptions.IgnoreCase, new string[] { "Ch" } };
+ foreach (RegexEngine engine in RegexHelpers.AvailableEngines) // Yields the cs-CZ IgnoreCase rows once per available engine. Row layout: engine, culture name, pattern, input, options, expected groups.
+ {
+ yield return new object[] { engine, "cs-CZ", "CH", "Ch", RegexOptions.IgnoreCase, new string[] { "Ch" } }; // presumably chosen because "ch" is treated specially by Czech linguistic comparison - TODO confirm
+ yield return new object[] { engine, "cs-CZ", "cH", "Ch", RegexOptions.IgnoreCase, new string[] { "Ch" } };
+ }
}
public static IEnumerable<object[]> Groups_CustomCulture_TestData_Danish()
{
- yield return new object[] { "da-DK", "AA", "Aa", RegexOptions.IgnoreCase, new string[] { "Aa" } };
- yield return new object[] { "da-DK", "aA", "Aa", RegexOptions.IgnoreCase, new string[] { "Aa" } };
+ foreach (RegexEngine engine in RegexHelpers.AvailableEngines) // Yields the da-DK IgnoreCase rows once per available engine. Row layout: engine, culture name, pattern, input, options, expected groups.
+ {
+ yield return new object[] { engine, "da-DK", "AA", "Aa", RegexOptions.IgnoreCase, new string[] { "Aa" } }; // presumably chosen because "aa" is treated specially by Danish linguistic comparison - TODO confirm
+ yield return new object[] { engine, "da-DK", "aA", "Aa", RegexOptions.IgnoreCase, new string[] { "Aa" } };
+ }
}
public static IEnumerable<object[]> Groups_CustomCulture_TestData_Turkish()
{
- yield return new object[] { "tr-TR", "\u0131", "\u0049", RegexOptions.IgnoreCase, new string[] { "\u0049" } };
- yield return new object[] { "tr-TR", "\u0130", "\u0069", RegexOptions.IgnoreCase, new string[] { "\u0069" } };
+ foreach (RegexEngine engine in RegexHelpers.AvailableEngines) // Yields the tr-TR IgnoreCase rows once per available engine. Row layout: engine, culture name, pattern, input, options, expected groups.
+ {
+ yield return new object[] { engine, "tr-TR", "\u0131", "\u0049", RegexOptions.IgnoreCase, new string[] { "\u0049" } }; // \u0131 against 'I' (\u0049); the row expects a match under tr-TR
+ yield return new object[] { engine, "tr-TR", "\u0130", "\u0069", RegexOptions.IgnoreCase, new string[] { "\u0069" } }; // \u0130 against 'i' (\u0069); the row expects a match under tr-TR
+ }
}
public static IEnumerable<object[]> Groups_CustomCulture_TestData_AzeriLatin()
{
- if (PlatformDetection.IsNotBrowser)
+ foreach (RegexEngine engine in RegexHelpers.AvailableEngines) // Yields the az-Latn-AZ IgnoreCase rows once per available engine; consumed by the Groups theory via MemberData.
{
- yield return new object[] { "az-Latn-AZ", "\u0131", "\u0049", RegexOptions.IgnoreCase, new string[] { "\u0049" } };
- yield return new object[] { "az-Latn-AZ", "\u0130", "\u0069", RegexOptions.IgnoreCase, new string[] { "\u0069" } };
+ if (PlatformDetection.IsNotBrowser) // No rows are produced when IsNotBrowser is false; the check does not depend on the engine being iterated.
+ {
+ yield return new object[] { engine, "az-Latn-AZ", "\u0131", "\u0049", RegexOptions.IgnoreCase, new string[] { "\u0049" } }; // \u0131 against 'I' (\u0049)
+ yield return new object[] { engine, "az-Latn-AZ", "\u0130", "\u0069", RegexOptions.IgnoreCase, new string[] { "\u0069" } }; // \u0130 against 'i' (\u0069)
+ }
}
}
[MemberData(nameof(Groups_CustomCulture_TestData_AzeriLatin))]
[ActiveIssue("https://github.com/dotnet/runtime/issues/56407", TestPlatforms.Android)]
[ActiveIssue("https://github.com/dotnet/runtime/issues/36900", TestPlatforms.iOS | TestPlatforms.tvOS | TestPlatforms.MacCatalyst)]
- public async Task Groups(string cultureName, string pattern, string input, RegexOptions options, string[] expectedGroups, string altMatch = null)
+ public async Task Groups(RegexEngine engine, string cultureName, string pattern, string input, RegexOptions options, string[] expectedGroups, string altMatch = null)
{
if (cultureName is null)
{
cultureName = culture.Equals(CultureInfo.InvariantCulture) ? "en-US" : culture.Name;
}
- using (new ThreadCultureChange(cultureName))
- {
- foreach (RegexEngine engine in RegexHelpers.AvailableEngines)
- {
- // Alternative altMatch when order of alternations matters in backtracking but order does not matter in NonBacktracking mode
- // Also in NonBacktracking there is only a single top-level match, which is expectedGroups[0] when altMatch is null
- string[] expected = engine == RegexEngine.NonBacktracking ?
- new string[] { altMatch ?? expectedGroups[0] } :
- expectedGroups;
+ // altMatch is the expected value for NonBacktracking in cases where the order of alternations matters to the backtracking engines but not to NonBacktracking.
+ // NonBacktracking also reports only a single top-level match, so its expectation is reduced to one element: altMatch, or expectedGroups[0] when altMatch is null.
+ expectedGroups = engine == RegexEngine.NonBacktracking ?
+ new string[] { altMatch ?? expectedGroups[0] } :
+ expectedGroups;
- await GroupsAsync(engine, pattern, input, options, expected);
- }
+ if (engine == RegexEngine.NonBacktracking && pattern.Contains("?(cat)"))
+ {
+ // General if-then-else construct is not supported and uses the ?(cat) condition in the tests
+ // TODO-NONBACKTRACKING: The constructor will throw NotSupportedException so this check will become obsolete
+ return;
}
- static async Task GroupsAsync(RegexEngine engine, string pattern, string input, RegexOptions options, string[] expectedGroups)
+ using var _ = new ThreadCultureChange(cultureName); // using declaration: disposed when the method scope exits, so it covers regex construction and every assertion below
+
+ Regex regex;
+ try
{
- if (engine == RegexEngine.NonBacktracking && pattern.Contains("?(cat)"))
- {
- // General if-then-else construct is not supported and uses the ?(cat) condition in the tests
- // TODO-NONBACKTRACKING: The constructor will throw NotSupportedException so this check will become obsolete
- return;
- }
+ regex = await RegexHelpers.GetRegexAsync(engine, pattern, options);
+ }
+ catch (NotSupportedException) when (RegexHelpers.IsNonBacktracking(engine))
+ {
+ // Some constructs are not supported in NonBacktracking mode, such as: if-then-else, lookaround, and backreferences
+ return;
+ }
- Regex regex;
- try
- {
- regex = await RegexHelpers.GetRegexAsync(engine, pattern, options);
- }
- catch (NotSupportedException) when (RegexHelpers.IsNonBacktracking(engine))
- {
- // Some constructs are not supported in NonBacktracking mode, such as: if-then-else, lookaround, and backreferences
- return;
- }
+ Match match = regex.Match(input);
- Match match = regex.Match(input);
+ Assert.True(match.Success); // every data row is expected to produce a successful match
+ Assert.Equal(expectedGroups[0], match.Value); // element 0 of the expectation is the overall match value
- Assert.True(match.Success);
- Assert.Equal(expectedGroups[0], match.Value);
+ if (!RegexHelpers.IsNonBacktracking(engine)) // per-group checks are skipped for NonBacktracking engines. NOTE(review): the expectation above is reduced only when engine == RegexEngine.NonBacktracking, while this uses IsNonBacktracking(engine) - confirm the two conditions are meant to differ
+ {
+ Assert.Equal(expectedGroups.Length, match.Groups.Count);
- if (!RegexHelpers.IsNonBacktracking(engine))
+ int[] groupNumbers = regex.GetGroupNumbers();
+ string[] groupNames = regex.GetGroupNames();
+ for (int i = 0; i < expectedGroups.Length; i++)
{
- Assert.Equal(expectedGroups.Length, match.Groups.Count);
-
- int[] groupNumbers = regex.GetGroupNumbers();
- string[] groupNames = regex.GetGroupNames();
- for (int i = 0; i < expectedGroups.Length; i++)
- {
- Assert.Equal(expectedGroups[i], match.Groups[groupNumbers[i]].Value);
- Assert.Equal(match.Groups[groupNumbers[i]], match.Groups[groupNames[i]]);
-
- Assert.Equal(groupNumbers[i], regex.GroupNumberFromName(groupNames[i]));
- Assert.Equal(groupNames[i], regex.GroupNameFromNumber(groupNumbers[i]));
- }
+ Assert.Equal(expectedGroups[i], match.Groups[groupNumbers[i]].Value); // expected value of the i-th group, looked up by its group number
+ Assert.Equal(match.Groups[groupNumbers[i]], match.Groups[groupNames[i]]); // lookup by number and lookup by name must yield the same group
+
+ Assert.Equal(groupNumbers[i], regex.GroupNumberFromName(groupNames[i])); // name -> number mapping agrees with GetGroupNumbers
+ Assert.Equal(groupNames[i], regex.GroupNameFromNumber(groupNumbers[i])); // number -> name mapping agrees with GetGroupNames
}
}
}
{
yield return ("aaa(?i:match this)bbb", "aaaMaTcH ThIsbbb", RegexOptions.None, 0, 16, true, "aaaMaTcH ThIsbbb");
}
+ yield return ("(?i:a)b(?i:c)d", "aaaaAbCdddd", RegexOptions.None, 0, 11, true, "AbCd");
+ yield return ("(?i:[\u0000-\u1000])[Bb]", "aaaaAbCdddd", RegexOptions.None, 0, 11, true, "Ab");
// Turning off case insensitive option in mid-pattern : Actual - "aaa(?-i:match this)bbb", "i"
yield return ("aAa(?-i:match this)bbb", "AaAmatch thisBBb", RegexOptions.IgnoreCase, 0, 16, true, "AaAmatch thisBBb");
yield return (@"\p{Ll}", "1bc", RegexOptions.IgnoreCase, 0, 3, true, "b");
yield return (@"\p{Lt}", "1bc", RegexOptions.IgnoreCase, 0, 3, true, "b");
yield return (@"\p{Lo}", "1bc", RegexOptions.IgnoreCase, 0, 3, false, string.Empty);
+ yield return (".[abc]", "xYZAbC", RegexOptions.IgnoreCase, 0, 6, true, "ZA");
+ yield return (".[abc]", "xYzXyZx", RegexOptions.IgnoreCase, 0, 6, false, "");
// "\D+"
yield return (@"\D+", "12321", RegexOptions.None, 0, 5, false, string.Empty);
yield return ("(?>(?:a|ab|abc|abcd))d", "abcd", RegexOptions.RightToLeft, 0, 4, true, "abcd");
}
yield return ("[^a-z0-9]etag|[^a-z0-9]digest", "this string has .digest as a substring", RegexOptions.None, 16, 7, true, ".digest");
- yield return (@"a\w*a|def", "aaaaa", RegexOptions.None, 0, 5, true, "aaaaa");
// No Negation
yield return ("[abcd-[abcd]]+", "abcxyzABCXYZ`!@#$%^&*()_-+= \t\n", RegexOptions.None, 0, 30, false, string.Empty);
yield return (@".*\dFoo", "This1foo should 2FoO match", RegexOptions.IgnoreCase | RegexOptions.RightToLeft, 0, 26, true, "This1foo should 2FoO");
yield return (@".*\dFoo", "This1Foo should 2fOo match", RegexOptions.IgnoreCase | RegexOptions.RightToLeft, 0, 26, true, "This1Foo should 2fOo");
yield return (@".*\dfoo", "1fooThis2FOO should 1foo match", RegexOptions.IgnoreCase | RegexOptions.RightToLeft, 8, 4, true, "2FOO");
+ yield return (@"[\w\s].*", "1fooThis2FOO should 1foo match", RegexOptions.IgnoreCase | RegexOptions.RightToLeft, 0, 30, true, "1fooThis2FOO should 1foo match");
+ yield return (@"i.*", "1fooThis2FOO should 1foo match", RegexOptions.IgnoreCase | RegexOptions.RightToLeft, 0, 30, true, "is2FOO should 1foo match");
}
// [ActiveIssue("https://github.com/dotnet/runtime/issues/36149")]
// yield return (@"^(?i:[\u24B6-\u24D0])$", ((char)('\u24CF' + 26)).ToString(), RegexOptions.IgnoreCase | RegexOptions.CultureInvariant, 0, 1, true, ((char)('\u24CF' + 26)).ToString());
//}
+ // Long inputs
+ string longCharacterRange = string.Concat(Enumerable.Range(1, 0x2000).Select(c => (char)c));
+ foreach (RegexOptions options in new[] { RegexOptions.None, RegexOptions.IgnoreCase })
+ {
+ yield return ("\u1000", longCharacterRange, options, 0, 0x2000, true, "\u1000");
+ yield return ("[\u1000-\u1001]", longCharacterRange, options, 0, 0x2000, true, "\u1000");
+ yield return ("[\u0FF0-\u0FFF][\u1000-\u1001]", longCharacterRange, options, 0, 0x2000, true, "\u0FFF\u1000");
+
+ yield return ("\uA640", longCharacterRange, options, 0, 0x2000, false, "");
+ yield return ("[\u3000-\u3001]", longCharacterRange, options, 0, 0x2000, false, "");
+ yield return ("[\uA640-\uA641][\u3000-\u3010]", longCharacterRange, options, 0, 0x2000, false, "");
+
+ if (!RegexHelpers.IsNonBacktracking(engine))
+ {
+ yield return ("\u1000", longCharacterRange, options | RegexOptions.RightToLeft, 0, 0x2000, true, "\u1000");
+ yield return ("[\u1000-\u1001]", longCharacterRange, options | RegexOptions.RightToLeft, 0, 0x2000, true, "\u1001");
+ yield return ("[\u1000][\u1001-\u1010]", longCharacterRange, options, 0, 0x2000, true, "\u1000\u1001");
+
+ yield return ("\uA640", longCharacterRange, options | RegexOptions.RightToLeft, 0, 0x2000, false, "");
+ yield return ("[\u3000-\u3001][\uA640-\uA641]", longCharacterRange, options | RegexOptions.RightToLeft, 0, 0x2000, false, "");
+ }
+ }
+
foreach (RegexOptions options in new[] { RegexOptions.None, RegexOptions.Singleline })
{
yield return (@"\W.*?\D", "seq 012 of 3 digits", options, 0, 19, true, " 012 ");
// Repeaters
Assert.False((await RegexHelpers.GetRegexAsync(engine, @"a{2147483647,}")).IsMatch("a"));
- Assert.False((await RegexHelpers.GetRegexAsync(engine, @"a{50,}")).IsMatch("a")); // cutoff for Boyer-Moore prefix in debug
- Assert.False((await RegexHelpers.GetRegexAsync(engine, @"a{51,}")).IsMatch("a"));
+ Assert.False((await RegexHelpers.GetRegexAsync(engine, @"a{50,}")).IsMatch("a"));
Assert.False((await RegexHelpers.GetRegexAsync(engine, @"a{50_000,}")).IsMatch("a")); // cutoff for Boyer-Moore prefix in release
- Assert.False((await RegexHelpers.GetRegexAsync(engine, @"a{50_001,}")).IsMatch("a"));
// Multis
- foreach (int length in new[] { 50, 51, 50_000, 50_001, char.MaxValue + 1 }) // based on knowledge of cut-offs used in Boyer-Moore
+ foreach (int length in new[] { 50, 50_000, char.MaxValue + 1 })
{
// The large counters are too slow for counting a's in NonBacktracking engine
// They will incur a constant of size length because in .*a{k} after reading n a's the
return start == 0;
}
- public static Regex CreateRegexInCulture(string pattern, RegexOptions options, Globalization.CultureInfo culture)
+ public static async Task<Regex> GetRegexAsync(RegexEngine engine, string pattern, RegexOptions options, Globalization.CultureInfo culture) // Builds a regex for the given engine while a ThreadCultureChange for `culture` is active.
{
using (new System.Tests.ThreadCultureChange(culture))
{
- return new Regex(pattern, options);
+ return await GetRegexAsync(engine, pattern, options); // awaited inside the using block, so the ThreadCultureChange is disposed only after construction has completed
}
}
// - Handle NonBacktrackingSourceGenerated
return
- options is null ? new Regex(pattern, RegexOptions.Compiled | OptionsFromEngine(engine)) :
+ options is null ? new Regex(pattern, OptionsFromEngine(engine)) :
matchTimeout is null ? new Regex(pattern, options.Value | OptionsFromEngine(engine)) :
new Regex(pattern, options.Value | OptionsFromEngine(engine), matchTimeout.Value);
}
{
(string pattern, RegexOptions? options, TimeSpan? matchTimeout) = regexes[i];
results[i] =
- options is null ? new Regex(pattern, RegexOptions.Compiled | OptionsFromEngine(engine)) :
+ options is null ? new Regex(pattern, OptionsFromEngine(engine)) :
matchTimeout is null ? new Regex(pattern, options.Value | OptionsFromEngine(engine)) :
new Regex(pattern, options.Value | OptionsFromEngine(engine), matchTimeout.Value);
}
{
public class RegexCultureTests
{
- // TODO: Validate source generator after figuring out what to do with culture
-
- public static IEnumerable<RegexOptions> RegexOptionsExtended()
- {
- yield return RegexOptions.None;
- yield return RegexOptions.Compiled;
- if (PlatformDetection.IsNetCore)
- {
- yield return RegexHelpers.RegexOptionNonBacktracking;
- }
- }
-
- public static IEnumerable<object[]> RegexOptionsExtended_MemberData() =>
- from options in RegexOptionsExtended()
- select new object[] { options };
-
public static IEnumerable<object[]> CharactersComparedOneByOne_AnchoredPattern_TestData()
{
- foreach (RegexOptions options in RegexOptionsExtended())
+ foreach (RegexEngine engine in RegexHelpers.AvailableEngines) // Yields four da-DK rows per available engine. Row layout: engine, pattern, input, culture, options, expected IsMatch result.
{
- yield return new object[] { "^aa$", "aA", "da-DK", options, false };
- yield return new object[] { "^aA$", "aA", "da-DK", options, true };
- yield return new object[] { "^aa$", "aA", "da-DK", options | RegexOptions.IgnoreCase, true };
- yield return new object[] { "^aA$", "aA", "da-DK", options | RegexOptions.IgnoreCase, true };
+ yield return new object[] { engine, "^aa$", "aA", "da-DK", RegexOptions.None, false }; // case-sensitive: "aa" is expected not to match "aA"
+ yield return new object[] { engine, "^aA$", "aA", "da-DK", RegexOptions.None, true }; // case-sensitive: identical text is expected to match
+ yield return new object[] { engine, "^aa$", "aA", "da-DK", RegexOptions.IgnoreCase, true }; // case-insensitive: "aa" is expected to match "aA"
+ yield return new object[] { engine, "^aA$", "aA", "da-DK", RegexOptions.IgnoreCase, true }; // case-insensitive: identical text is expected to match
}
}
[Theory]
[MemberData(nameof(CharactersComparedOneByOne_AnchoredPattern_TestData))]
- public void CharactersComparedOneByOne_AnchoredPattern(string pattern, string input, string culture, RegexOptions options, bool expected)
+ public async Task CharactersComparedOneByOne_AnchoredPattern(RegexEngine engine, string pattern, string input, string culture, RegexOptions options, bool expected) // Checks IsMatch for one engine/pattern/input row while the named culture is applied.
{
// Regex compares characters one by one. If that changes, it could impact the behavior of
// a case like this, where these characters are not the same, but the strings compare
// as equal with the invariant culture (and some other cultures as well).
using (new ThreadCultureChange(culture))
{
- foreach (RegexOptions compiled in new[] { RegexOptions.None, RegexOptions.Compiled })
- {
- Assert.Equal(expected, new Regex(pattern, options | compiled).IsMatch(input));
- }
+ Regex r = await RegexHelpers.GetRegexAsync(engine, pattern, options); // constructed inside the ThreadCultureChange scope; the engine now comes from the data row instead of a local None/Compiled loop
+ Assert.Equal(expected, r.IsMatch(input)); // matching also runs inside the ThreadCultureChange scope
}
}
-
public static IEnumerable<object[]> CharactersComparedOneByOne_Invariant_TestData()
{
- foreach (RegexOptions options in RegexOptionsExtended())
+ foreach (RegexEngine engine in RegexHelpers.AvailableEngines) // Yields two rows per available engine for CharactersComparedOneByOne_Invariant. Row layout: engine, options.
{
- yield return new object[] { options };
- yield return new object[] { options | RegexOptions.IgnoreCase | RegexOptions.CultureInvariant };
+ yield return new object[] { engine, RegexOptions.None }; // no additional options
+ yield return new object[] { engine, RegexOptions.IgnoreCase | RegexOptions.CultureInvariant }; // case-insensitive with CultureInvariant
}
}
[Theory]
[MemberData(nameof(CharactersComparedOneByOne_Invariant_TestData))]
- public void CharactersComparedOneByOne_Invariant(RegexOptions options)
+ public async Task CharactersComparedOneByOne_Invariant(RegexEngine engine, RegexOptions options)
{
// Regex compares characters one by one. If that changes, it could impact the behavior of
// a case like this, where these characters are not the same, but the strings compare
string input = string.Concat(Enumerable.Repeat(S2, multiple));
Regex r;
- // Validate when the string is at the beginning of the pattern, as it impacts Boyer-Moore prefix matching.
- r = new Regex(pattern, options);
+ // Validate when the string is at the beginning of the pattern, as it impacts prefix matching.
+ r = await RegexHelpers.GetRegexAsync(engine, pattern, options);
Assert.False(r.IsMatch(input));
Assert.True(r.IsMatch(pattern));
// Validate when it's not at the beginning of the pattern, as it impacts "multi" matching.
- r = new Regex("[abc]" + pattern, options);
+ r = await RegexHelpers.GetRegexAsync(engine, "[abc]" + pattern, options);
Assert.False(r.IsMatch("a" + input));
Assert.True(r.IsMatch("a" + pattern));
}
}
- public static IEnumerable<object[]> CharactersLowercasedOneByOne_MemberData()
- {
- foreach (RegexEngine engine in RegexHelpers.AvailableEngines)
- {
- switch (engine)
- {
- case RegexEngine.SourceGenerated:
- case RegexEngine.NonBacktrackingSourceGenerated:
- continue;
- }
-
- yield return new object[] { engine };
- }
- }
-
[Theory]
- [MemberData(nameof(CharactersLowercasedOneByOne_MemberData))]
+ [MemberData(nameof(RegexHelpers.AvailableEngines_MemberData), MemberType = typeof(RegexHelpers))]
public async Task CharactersLowercasedOneByOne(RegexEngine engine)
{
using (new ThreadCultureChange("en-US"))
[SkipOnTargetFramework(TargetFrameworkMonikers.NetFramework, "Doesn't support NonBacktracking")]
[Fact]
[ActiveIssue("https://github.com/dotnet/runtime/issues/60568", TestPlatforms.Android)]
- public void TurkishI_Is_Differently_LowerUpperCased_In_Turkish_Culture_NonBacktracking()
+ public async Task TurkishI_Is_Differently_LowerUpperCased_In_Turkish_Culture_NonBacktracking()
{
var turkish = new CultureInfo("tr-TR");
string input = "I\u0131\u0130i";
// Use the input as the regex also
// Ignore the Compiled option here because it is a noop in combination with NonBacktracking
- Regex cultInvariantRegex = RegexHelpers.CreateRegexInCulture(input, RegexHelpers.RegexOptionNonBacktracking | RegexOptions.IgnoreCase | RegexOptions.CultureInvariant, CultureInfo.InvariantCulture);
- Regex turkishRegex = RegexHelpers.CreateRegexInCulture(input, RegexHelpers.RegexOptionNonBacktracking | RegexOptions.IgnoreCase, turkish);
+ Regex cultInvariantRegex = await RegexHelpers.GetRegexAsync(RegexEngine.NonBacktracking, input, RegexOptions.IgnoreCase | RegexOptions.CultureInvariant, CultureInfo.InvariantCulture); // NonBacktracking engine, constructed with the invariant culture applied and CultureInvariant set
+ Regex turkishRegex = await RegexHelpers.GetRegexAsync(RegexEngine.NonBacktracking, input, RegexOptions.IgnoreCase, turkish); // NonBacktracking engine, constructed with tr-TR applied and without CultureInvariant
Assert.True(cultInvariantRegex.IsMatch(input));
Assert.True(turkishRegex.IsMatch(input)); // <---------- This result differs from the result in the previous test!!!
Assert.True(turkishRegex.IsMatch(input.ToUpper(turkish)));
}
- [ActiveIssue("Incorrect handling of IgnoreCase over intervals in Turkish Culture, https://github.com/dotnet/runtime/issues/58958")]
- [Fact]
- public void TurkishCulture_Handling_Of_IgnoreCase()
+ [ActiveIssue("https://github.com/dotnet/runtime/issues/58958")]
+ [Theory]
+ [MemberData(nameof(RegexHelpers.AvailableEngines_MemberData), MemberType = typeof(RegexHelpers))] // engine values are supplied by RegexHelpers.AvailableEngines_MemberData
+ public async Task TurkishCulture_Handling_Of_IgnoreCase(RegexEngine engine) // Checks that a character-range pattern still matches its own in-range input under IgnoreCase with tr-TR.
{
var turkish = new CultureInfo("tr-TR");
string input = "I\u0131\u0130i";
string pattern = "[H-J][\u0131-\u0140][\u0120-\u0130][h-j]";
- Regex regex = RegexHelpers.CreateRegexInCulture(pattern, RegexOptions.IgnoreCase, turkish);
+ Regex regex = await RegexHelpers.GetRegexAsync(engine, pattern, RegexOptions.IgnoreCase, turkish); // constructed through the culture-taking GetRegexAsync overload, with tr-TR as the culture
// The pattern must trivially match the input because all of the letters fall in the given intervals
// Ignoring case can only add more letters here -- not REMOVE letters
Assert.True(regex.IsMatch(input));
}
- [SkipOnTargetFramework(TargetFrameworkMonikers.NetFramework, "Doesn't support NonBacktracking")]
- [Fact]
- public void TurkishCulture_Handling_Of_IgnoreCase_NonBacktracking()
+ public static IEnumerable<object[]> TurkishCulture_MatchesWordChar_MemberData() // Data for TurkishCulture_MatchesWordChar. Row layout: engine, input, options, expected value of the first match.
{
- var turkish = new CultureInfo("tr-TR");
- string input = "I\u0131\u0130i";
- string pattern = "[H-J][\u0131-\u0140][\u0120-\u0130][h-j]";
-
- Regex regex = RegexHelpers.CreateRegexInCulture(pattern, RegexOptions.IgnoreCase | RegexHelpers.RegexOptionNonBacktracking, turkish);
+ foreach (RegexEngine engine in RegexHelpers.AvailableEngines) // rows are repeated for every available engine
+ {
+ yield return new object[] { engine, "I\u0131\u0130i", RegexOptions.None, "I\u0131\u0130i" }; // the whole input is expected to be matched
+ yield return new object[] { engine, "I\u0131\u0130i", RegexOptions.IgnoreCase, "I\u0131\u0130i" }; // same expectation with IgnoreCase
+ if (!RegexHelpers.IsNonBacktracking(engine)) // the ECMAScript row is produced only for engines that are not NonBacktracking (presumably that combination is unsupported - TODO confirm)
+ {
+ yield return new object[] { engine, "I\u0131\u0130i", RegexOptions.IgnoreCase | RegexOptions.ECMAScript, "" }; // under IgnoreCase | ECMAScript the first match is expected to be empty
+ }
+ }
+ }
- // The pattern must trivially match the input because all of the letters fall in the given intervals
- // Ignoring case can only add more letters here -- not REMOVE letters
- Assert.True(regex.IsMatch(input));
+ [Theory]
+ [MemberData(nameof(TurkishCulture_MatchesWordChar_MemberData))]
+ public async Task TurkishCulture_MatchesWordChar(RegexEngine engine, string input, RegexOptions options, string expectedResult) // Checks what \w* matches first in the input while a tr-TR ThreadCultureChange is active.
+ {
+ using (new ThreadCultureChange(new CultureInfo("tr-TR")))
+ {
+ Regex regex = await RegexHelpers.GetRegexAsync(engine, @"\w*", options); // constructed inside the tr-TR scope
+ Assert.Equal(expectedResult, regex.Match(input).Value); // compares only the value of the first match
+ }
}
public static IEnumerable<object[]> Match_In_Different_Cultures_TestData()
{
CultureInfo invariant = CultureInfo.InvariantCulture;
- CultureInfo current = CultureInfo.CurrentCulture;
+ CultureInfo enUS = new CultureInfo("en-US");
CultureInfo turkish = new CultureInfo("tr-TR");
- foreach (RegexOptions options in RegexOptionsExtended())
+ foreach (RegexEngine engine in RegexHelpers.AvailableEngines)
{
// \u0130 (Turkish I with dot) and \u0131 (Turkish i without dot) are unrelated characters in general
// Expected answers in the default en-US culture
- yield return new object[] { "(?i:I)", options, current, "xy\u0131ab", "" };
- yield return new object[] { "(?i:iI+)", options, current, "abcIIIxyz", "III" };
- yield return new object[] { "(?i:iI+)", options, current, "abcIi\u0130xyz", "Ii\u0130" };
- yield return new object[] { "(?i:iI+)", options, current, "abcI\u0130ixyz", "I\u0130i" };
- yield return new object[] { "(?i:iI+)", options, current, "abc\u0130IIxyz", "\u0130II" };
- yield return new object[] { "(?i:iI+)", options, current, "abc\u0130\u0131Ixyz", "" };
- yield return new object[] { "(?i:iI+)", options, current, "abc\u0130Iixyz", "\u0130Ii" };
- yield return new object[] { "(?i:[^IJKLM]I)", options, current, "ii\u0130i\u0131ab", "" };
+ yield return new object[] { "(?i:I)", engine, enUS, "xy\u0131ab", "" };
+ yield return new object[] { "(?i:iI+)", engine, enUS, "abcIIIxyz", "III" };
+ yield return new object[] { "(?i:iI+)", engine, enUS, "abcIi\u0130xyz", "Ii\u0130" };
+ yield return new object[] { "(?i:iI+)", engine, enUS, "abcI\u0130ixyz", "I\u0130i" };
+ yield return new object[] { "(?i:iI+)", engine, enUS, "abc\u0130IIxyz", "\u0130II" };
+ yield return new object[] { "(?i:iI+)", engine, enUS, "abc\u0130\u0131Ixyz", "" };
+ yield return new object[] { "(?i:iI+)", engine, enUS, "abc\u0130Iixyz", "\u0130Ii" };
+ yield return new object[] { "(?i:[^IJKLM]I)", engine, enUS, "ii\u0130i\u0131ab", "" };
// Expected answers in the invariant culture
- yield return new object[] { "(?i:I)", options, invariant, "xy\u0131ab", "" };
- yield return new object[] { "(?i:iI+)", options, invariant, "abcIIIxyz", "III" };
- yield return new object[] { "(?i:iI+)", options, invariant, "abc\u0130\u0131Ixyz", "" };
+ yield return new object[] { "(?i:I)", engine, invariant, "xy\u0131ab", "" };
+ yield return new object[] { "(?i:iI+)", engine, invariant, "abcIIIxyz", "III" };
+ yield return new object[] { "(?i:iI+)", engine, invariant, "abc\u0130\u0131Ixyz", "" };
// Expected answers in the Turkish culture
//
// https://github.com/dotnet/runtime/issues/60568
if (!PlatformDetection.IsAndroid)
{
- yield return new object[] { "(?i:I)", options, turkish, "xy\u0131ab", "\u0131" };
- yield return new object[] { "(?i:iI+)", options, turkish, "abcIIIxyz", "" };
- yield return new object[] { "(?i:iI+)", options, turkish, "abcIi\u0130xyz", "" };
- yield return new object[] { "(?i:iI+)", options, turkish, "abcI\u0130ixyz", "" };
- yield return new object[] { "(?i:[^IJKLM]I)", options, turkish, "ii\u0130i\u0131ab", "i\u0131" };
+ yield return new object[] { "(?i:I)", engine, turkish, "xy\u0131ab", "\u0131" };
+ yield return new object[] { "(?i:iI+)", engine, turkish, "abcIIIxyz", "" };
+ yield return new object[] { "(?i:iI+)", engine, turkish, "abcIi\u0130xyz", "" };
+ yield return new object[] { "(?i:iI+)", engine, turkish, "abcI\u0130ixyz", "" };
+ yield return new object[] { "(?i:[^IJKLM]I)", engine, turkish, "ii\u0130i\u0131ab", "i\u0131" };
}
// None and Compiled are separated into the Match_In_Different_Cultures_CriticalCases test
- if (options == RegexHelpers.RegexOptionNonBacktracking)
+ if (RegexHelpers.IsNonBacktracking(engine))
{
- foreach (object[] data in Match_In_Different_Cultures_CriticalCases_TestData_For(options))
+ foreach (object[] data in Match_In_Different_Cultures_CriticalCases_TestData_For(engine))
{
yield return data;
}
}
}
// Yields the "critical" culture-sensitive IgnoreCase rows for one engine, each shaped as
// { pattern, engine, culture, input, expected match value }. Rows for the invariant culture are
// always produced; rows for tr-TR are skipped on Android. The trailing "failing for ..." notes on
// each row record which engines were observed to fail that case.
- public static IEnumerable<object[]> Match_In_Different_Cultures_CriticalCases_TestData_For(RegexOptions options)
+ public static IEnumerable<object[]> Match_In_Different_Cultures_CriticalCases_TestData_For(RegexEngine engine)
{
CultureInfo invariant = CultureInfo.InvariantCulture;
CultureInfo turkish = new CultureInfo("tr-TR");
// Expected answers in the invariant culture
- yield return new object[] { "(?i:iI+)", options, invariant, "abcIi\u0130xyz", "Ii" }; // <-- failing for None, Compiled
- yield return new object[] { "(?i:iI+)", options, invariant, "abcI\u0130ixyz", "" }; // <-- failing for Compiled
- yield return new object[] { "(?i:iI+)", options, invariant, "abc\u0130IIxyz", "II" }; // <-- failing for Compiled
- yield return new object[] { "(?i:iI+)", options, invariant, "abc\u0130Iixyz", "Ii" }; // <-- failing for Compiled
- yield return new object[] { "(?i:[^IJKLM]I)", options, invariant, "ii\u0130i\u0131ab", "\u0130i" }; // <-- failing for None, Compiled
+ yield return new object[] { "(?i:iI+)", engine, invariant, "abcIi\u0130xyz", "Ii" }; // <-- failing for None, Compiled
+ yield return new object[] { "(?i:iI+)", engine, invariant, "abcI\u0130ixyz", "" }; // <-- failing for Compiled
+ yield return new object[] { "(?i:iI+)", engine, invariant, "abc\u0130IIxyz", "II" }; // <-- failing for Compiled
+ yield return new object[] { "(?i:iI+)", engine, invariant, "abc\u0130Iixyz", "Ii" }; // <-- failing for Compiled
+ yield return new object[] { "(?i:[^IJKLM]I)", engine, invariant, "ii\u0130i\u0131ab", "\u0130i" }; // <-- failing for None, Compiled
// Expected answers in the Turkish culture
// Android produces unexpected results for tr-TR
// https://github.com/dotnet/runtime/issues/60568
if (!PlatformDetection.IsAndroid)
{
- yield return new object[] { "(?i:iI+)", options, turkish, "abc\u0130IIxyz", "\u0130II" }; // <-- failing for None, Compiled
- yield return new object[] { "(?i:iI+)", options, turkish, "abc\u0130\u0131Ixyz", "\u0130\u0131I" }; // <-- failing for None, Compiled
- yield return new object[] { "(?i:iI+)", options, turkish, "abc\u0130Iixyz", "\u0130I" }; // <-- failing for None, Compiled
+ yield return new object[] { "(?i:iI+)", engine, turkish, "abc\u0130IIxyz", "\u0130II" }; // <-- failing for None, Compiled
+ yield return new object[] { "(?i:iI+)", engine, turkish, "abc\u0130\u0131Ixyz", "\u0130\u0131I" }; // <-- failing for None, Compiled
+ yield return new object[] { "(?i:iI+)", engine, turkish, "abc\u0130Iixyz", "\u0130I" }; // <-- failing for None, Compiled
}
}
// Member data for Match_In_Different_Cultures_CriticalCases: the critical-case rows generated for
// the interpreter and for the compiled engine, combined with Union.
public static IEnumerable<object[]> Match_In_Different_Cultures_CriticalCases_TestData() =>
- Match_In_Different_Cultures_CriticalCases_TestData_For(RegexOptions.None).Union(Match_In_Different_Cultures_CriticalCases_TestData_For(RegexOptions.Compiled));
+ Match_In_Different_Cultures_CriticalCases_TestData_For(RegexEngine.Interpreter).Union(Match_In_Different_Cultures_CriticalCases_TestData_For(RegexEngine.Compiled));
// Creates a regex for the pattern with the given engine and culture (no extra RegexOptions; the
// patterns carry their own inline (?i:...) groups) and asserts that the value of the first match
// in the input equals match_expected.
// NOTE(review): rows whose expected value is "" appear to mean "no match" - confirm against the data.
[ActiveIssue("https://github.com/dotnet/runtime/issues/60899", TestPlatforms.Browser)]
+ [ActiveIssue("https://github.com/dotnet/runtime/issues/60697", TestPlatforms.iOS | TestPlatforms.tvOS)]
[Theory]
[MemberData(nameof(Match_In_Different_Cultures_TestData))]
- [ActiveIssue("https://github.com/dotnet/runtime/issues/60697", TestPlatforms.iOS | TestPlatforms.tvOS)]
- public void Match_In_Different_Cultures(string pattern, RegexOptions options, CultureInfo culture, string input, string match_expected)
+ public async Task Match_In_Different_Cultures(string pattern, RegexEngine engine, CultureInfo culture, string input, string match_expected)
{
- Regex r = RegexHelpers.CreateRegexInCulture(pattern, options, culture);
+ Regex r = await RegexHelpers.GetRegexAsync(engine, pattern, RegexOptions.None, culture);
Match match = r.Match(input);
Assert.Equal(match_expected, match.Value);
}
// Same check as Match_In_Different_Cultures, but fed from the critical-case data set: creates the
// regex with the given engine and culture and compares the first match value with match_expected.
// The whole theory is disabled through ActiveIssue, which cites the two tracking issues below.
[ActiveIssue("Incorrect treatment of IgnoreCase in Turkish and Invariant cultures, https://github.com/dotnet/runtime/issues/58956, https://github.com/dotnet/runtime/issues/58958 ")]
[Theory]
[MemberData(nameof(Match_In_Different_Cultures_CriticalCases_TestData))]
- public void Match_In_Different_Cultures_CriticalCases(string pattern, RegexOptions options, CultureInfo culture, string input, string match_expected)
+ public async Task Match_In_Different_Cultures_CriticalCases(string pattern, RegexEngine engine, CultureInfo culture, string input, string match_expected)
{
- Regex r = RegexHelpers.CreateRegexInCulture(pattern, options, culture);
+ Regex r = await RegexHelpers.GetRegexAsync(engine, pattern, RegexOptions.None, culture);
Match match = r.Match(input);
Assert.Equal(match_expected, match.Value);
}
/// </summary>
[OuterLoop("May take several seconds due to large number of cultures tested")]
[SkipOnTargetFramework(TargetFrameworkMonikers.NetFramework)]
- [Theory]
- [MemberData(nameof(RegexOptionsExtended_MemberData))]
- public void TestIgnoreCaseRelation(RegexOptions options)
+ [Fact]
+ public void TestIgnoreCaseRelation()
{
// these 22 characters are considered case-insensitive by regex, while they are case-sensitive outside regex
// but they are only case-sensitive in an asymmetrical way: tolower(c)=c, tolower(toupper(c)) != c
{
char cU = char.ToUpper(c);
Assert.NotEqual(c, cU);
- Assert.False(Regex.IsMatch(c.ToString(), cU.ToString(), options | RegexOptions.IgnoreCase));
+ Assert.False(Regex.IsMatch(c.ToString(), cU.ToString(), RegexOptions.IgnoreCase));
}
- Assert.False(Regex.IsMatch(Turkish_i_withoutDot.ToString(), "i", options | RegexOptions.IgnoreCase));
+ Assert.False(Regex.IsMatch(Turkish_i_withoutDot.ToString(), "i", RegexOptions.IgnoreCase));
// as baseline it is assumed that the invariant culture does not change
HashSet<char>[] inv_table = ComputeIgnoreCaseTable(CultureInfo.InvariantCulture, treatedAsCaseInsensitive);
/// <summary>Output directory for generated dgml files.</summary>
private static string DgmlOutputDirectoryPath => Path.Combine(s_tmpWorkingDir, "dgml");
- private static string ExperimentDirectoryPath => Path.Combine(s_tmpWorkingDir, "experiments");
-
- [ConditionalFact(nameof(Enabled))]
+ [Fact]
public void RegenerateUnicodeTables()
{
+ if (!Enabled)
+ {
+ return;
+ }
+
MethodInfo? genUnicode = typeof(Regex).GetMethod("GenerateUnicodeTables", BindingFlags.NonPublic | BindingFlags.Static);
// GenerateUnicodeTables is not available in Release build
if (genUnicode is not null)
}
}
- private static void WriteOutput(string message) =>
- File.AppendAllText(OutputFilePath, message);
-
/// <summary>Save the regex as a DFA in DGML format in the textwriter.</summary>
private static bool TrySaveDGML(Regex regex, TextWriter writer, int bound = -1, bool hideStateInfo = false, bool addDotStar = false, bool inReverse = false, bool onlyDFAinfo = false, int maxLabelLength = -1, bool asNFA = false)
{
}
}
- /// <summary>
- /// The intent is that this method is run in realease build for lightweight performance testing.
- /// One can e.g. open the outputfile in emacs with AUTO-REVERT-ON in order to follow the progress in real time.
- /// It will print timing info and match info for both DFA, Compiled option and None.
- /// Place sample regexes in the regexesfile (one per line) and sample input in inputfile.
- /// It will essentially produce a csv file with the info:
- /// regexnr, matchtime_DFA, result_DFA, matchtime_Compiled, result_Compiled, matchtime_None, result_None,
- /// where result_.. is one of
- /// Yes(index,length)
- /// No
- /// TIMEOUT
- /// ERROR
- /// and in the case of TIMEOUT or ERROR time is 10000 (the timeout limit of 10sec)
- /// </summary>
- [ConditionalFact(nameof(Enabled))]
- public void TestRunPerformance()
- {
- if (!Directory.Exists(ExperimentDirectoryPath))
- {
- Directory.CreateDirectory(ExperimentDirectoryPath);
- }
-
- string[] dirs = Directory.GetDirectories(ExperimentDirectoryPath);
- if (dirs.Length == 0)
- {
- WriteOutput("\nExperiments directory is empty");
- return;
- }
-
- DirectoryInfo experimentDI = Directory.GetParent(dirs[0]);
- DirectoryInfo[] experiments =
- Array.FindAll(experimentDI.GetDirectories(),
- di => ((di.Attributes & FileAttributes.Hidden) != (FileAttributes.Hidden)) &&
- Array.Exists(di.GetFiles(), f => f.Name.Equals("regexes.txt")) &&
- Array.Exists(di.GetFiles(), f => f.Name.Equals("input.txt")));
- if (experiments.Length == 0)
- {
- WriteOutput("\nExperiments directory has no indiviual experiment subdirectories containing files 'regexes.txt' and 'input.txt'.");
- return;
- }
-
- for (int i = 0; i < experiments.Length; i++)
- {
- string input = File.ReadAllText(Path.Combine(experiments[i].FullName, "input.txt"));
- string[] rawRegexes = File.ReadAllLines(Path.Combine(experiments[i].FullName, "regexes.txt"));
-
- WriteOutput($"\n---------- {experiments[i].Name} ----------");
- for (int r = 0; r < rawRegexes.Length; r++)
- {
- TestRunRegex((r + 1).ToString(), rawRegexes[r], input);
- }
- }
- }
-
private static long MeasureMatchTime(Regex re, string input, out Match match)
{
try
/// </summary>
private static string Not(string regex) => $"(?({regex})[0-[0]]|.*)";
- [ConditionalFact(nameof(Enabled))]
+ [Fact]
public void ViewSampleRegexInDGML()
{
+ if (!Enabled)
+ {
+ return;
+ }
+
try
{
//string rawregex = @"\bis\w*\b";
}
}
- private void TestRunRegex(string name, string rawregex, string input, bool viewDGML = false, bool dotStar = false)
- {
- var reNone = new Regex(rawregex, RegexOptions.None, new TimeSpan(0, 0, 10));
- var reCompiled = new Regex(rawregex, RegexOptions.Compiled, new TimeSpan(0, 0, 10));
- var reNonBacktracking = new Regex(rawregex, RegexOptions.NonBacktracking);
-
- if (viewDGML)
- ViewDGML(reNonBacktracking, addDotStar: dotStar);
- WriteOutput($"\n{name}");
-
- // First call in each case is a warmup
-
- // None
- MeasureMatchTime(reNone, input, out _);
- long tN = MeasureMatchTime(reNone, input, out Match mN);
- WriteMatchOutput(tN, mN);
-
- // Compiled
- MeasureMatchTime(reCompiled, input, out _);
- long tC = MeasureMatchTime(reCompiled, input, out Match mC);
- WriteMatchOutput(tC, mC);
-
- // Non-Backtracking
- MeasureMatchTime(reNonBacktracking, input, out _);
- long tD = MeasureMatchTime(reNonBacktracking, input, out Match mD);
- WriteMatchOutput(tD, mD);
-
- void WriteMatchOutput(long t, Match m)
- {
- WriteOutput(t switch
- {
- -1 => ",10000,TIMEOUT",
- -2 => ",10000,ERROR",
- _ when m.Success => $",{t},Yes({m.Index}:{m.Length})",
- _ => $",{t},No"
- });
- }
- }
-
#region Tests involving Intersection and Complement
// Currently only run in DEBUG mode in the NonBacktracking engine
[ConditionalFact(typeof(PlatformDetection), nameof(PlatformDetection.IsNetCore))]
[InlineData("(?i:abcde)|(?i:abcdf)", "(?i:abcd[ef])")]
[InlineData("xyz(?:(?i:abcde)|(?i:abcdf))", "xyz(?i:abcd[ef])")]
[InlineData("bonjour|hej|ciao|shalom|zdravo|pozdrav|hallo|hola|hello|hey|witam|tere|bonjou|salam|helo|sawubona", "(?>bonjou(?>r|)|h(?>e(?>j|(?>l(?>lo|o)|y))|allo|ola)|ciao|s(?>halom|a(?>lam|wubona))|zdravo|pozdrav|witam|tere)")]
+ [InlineData("\\w\\d123|\\w\\dabc", "\\w\\d(?:123|abc)")]
// Auto-atomicity
[InlineData("a*b", "(?>a*)b")]
[InlineData("a*b+", "(?>a*)b+")]
[InlineData("(?:w*)+\\.", "(?>w*)+\\.")]
[InlineData("(a[bcd]e*)*fg", "(a[bcd](?>e*))*fg")]
[InlineData("(\\w[bcd]\\s*)*fg", "(\\w[bcd](?>\\s*))*fg")]
+ // IgnoreCase set creation
+ [InlineData("(?i)abcd", "[Aa][Bb][Cc][Dd]")]
+ [InlineData("(?i)abcd|efgh", "[Aa][Bb][Cc][Dd]|[Ee][Ff][Gg][Hh]")]
+ [InlineData("(?i)a|b", "[AaBb]")]
+ [InlineData("(?i)[abcd]", "[AaBbCcDd]")]
+ [InlineData("(?i)[acexyz]", "[AaCcEeXxYyZz]")]
+ [InlineData("(?i)\\w", "\\w")]
+ [InlineData("(?i)\\d", "\\d")]
+ [InlineData("(?i).", ".")]
+ [InlineData("(?i)\\$", "\\$")]
public void PatternsReduceIdentically(string pattern1, string pattern2)
{
string result1 = GetRegexCodes(new Regex(pattern1));
}
Assert.NotEqual(GetRegexCodes(new Regex(pattern1, RegexOptions.RightToLeft)), GetRegexCodes(new Regex(pattern2)));
- if (!pattern1.Contains("?i:") && !pattern2.Contains("?i:"))
- {
- Assert.NotEqual(GetRegexCodes(new Regex(pattern1, RegexOptions.IgnoreCase)), GetRegexCodes(new Regex(pattern2)));
- }
}
[Theory]
// Not reducing branches of alternations with different casing
[InlineData("(?i:abcd)|abcd", "abcd|abcd")]
[InlineData("abcd|(?i:abcd)", "abcd|abcd")]
- [InlineData("abc(?:(?i:e)|f)", "abc[ef]")]
// Not applying auto-atomicity
[InlineData("a*b*", "(?>a*)b*")]
[InlineData("[ab]*[^a]", "(?>[ab]*)[^a]")]