Improve choice of IndexOfXx routine for some TryFindNextStartingPosition implementations

author Stephen Toub <stoub@microsoft.com>

Tue, 18 Jul 2023 21:38:16 +0000 (17:38 -0400)

committer GitHub <noreply@github.com>

Tue, 18 Jul 2023 21:38:16 +0000 (17:38 -0400)
author Stephen Toub <stoub@microsoft.com>
Tue, 18 Jul 2023 21:38:16 +0000 (17:38 -0400)
committer GitHub <noreply@github.com>
Tue, 18 Jul 2023 21:38:16 +0000 (17:38 -0400)
diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs

index 021453e..e54ffd3 100644 (file)
--- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs
+++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs
@@ -1063,25 +1063,49 @@ namespace System.Text.RegularExpressions.Generator
                          (true, _) => $"{span}.Slice(i + {primarySet.Distance})",
                      };
  
-                    Debug.Assert(!primarySet.Negated || (primarySet.Chars is null && primarySet.AsciiSet is null));
+                    // Get the IndexOf* expression to use to perform the search.
+                    string indexOf;
+                    if (primarySet.Chars is not null)
+                    {
+                        // We have a chars array, so we can use IndexOf{Any}{Except} to search for it. Choose the best overload.
+                        string indexOfName = "IndexOf", indexOfAnyName = "IndexOfAny";
+                        if (primarySet.Negated)
+                        {
+                            indexOfName = indexOfAnyName = "IndexOfAnyExcept";
+                        }
  
-                    string indexOf =
-                        primarySet.Chars is not null ? primarySet.Chars.Length switch
+                        indexOf = primarySet.Chars.Length switch
                          {
-                            1 => $"{span}.IndexOf({Literal(primarySet.Chars[0])})",
-                            2 => $"{span}.IndexOfAny({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])})",
-                            3 => $"{span}.IndexOfAny({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])}, {Literal(primarySet.Chars[2])})",
-                            _ => $"{span}.IndexOfAny({EmitSearchValuesOrLiteral(primarySet.Chars, requiredHelpers)})",
-                        } :
-                        primarySet.AsciiSet is not null ? $"{span}.IndexOfAny({EmitSearchValues(primarySet.AsciiSet, requiredHelpers)})" :
-                        primarySet.Range is not null ? (primarySet.Range.Value.LowInclusive == primarySet.Range.Value.HighInclusive, primarySet.Negated) switch
+                            1 => $"{span}.{indexOfName}({Literal(primarySet.Chars[0])})",
+                            2 => $"{span}.{indexOfAnyName}({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])})",
+                            3 => $"{span}.{indexOfAnyName}({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])}, {Literal(primarySet.Chars[2])})",
+                            _ => $"{span}.{indexOfAnyName}({EmitSearchValuesOrLiteral(primarySet.Chars, requiredHelpers)})",
+                        };
+                    }
+                    else if (primarySet.AsciiSet is not null)
+                    {
+                        // We have a set of ASCII chars, so we can use IndexOfAny(SearchValues) to search for it.
+                        Debug.Assert(!primarySet.Negated);
+                        indexOf = $"{span}.IndexOfAny({EmitSearchValues(primarySet.AsciiSet, requiredHelpers)})";
+                    }
+                    else if (primarySet.Range is not null)
+                    {
+                        // We have a range, so we can use IndexOfAny{Except}InRange to search for it.  In the corner case,
+                        // where we end up with a set of a single char, we can use IndexOf instead.
+                        indexOf = (primarySet.Range.Value.LowInclusive == primarySet.Range.Value.HighInclusive, primarySet.Negated) switch
                          {
                              (false, false) => $"{span}.IndexOfAnyInRange({Literal(primarySet.Range.Value.LowInclusive)}, {Literal(primarySet.Range.Value.HighInclusive)})",
                              (true, false) => $"{span}.IndexOf({Literal(primarySet.Range.Value.LowInclusive)})",
                              (false, true) => $"{span}.IndexOfAnyExceptInRange({Literal(primarySet.Range.Value.LowInclusive)}, {Literal(primarySet.Range.Value.HighInclusive)})",
                              (true, true) => $"{span}.IndexOfAnyExcept({Literal(primarySet.Range.Value.LowInclusive)})",
-                        } :
-                        $"{span}.{EmitIndexOfAnyCustomHelper(primarySet.Set, requiredHelpers, checkOverflow)}()";
+                        };
+                    }
+                    else
+                    {
+                        // We have an arbitrary set of characters that includes at least one non-ASCII char.  We use a custom IndexOfAny helper that
+                        // will perform the search as efficiently as possible.
+                        indexOf = $"{span}.{EmitIndexOfAnyCustomHelper(primarySet.Set, requiredHelpers, checkOverflow)}()";
+                    }
  
                      if (needLoop)
                      {
@@ -1184,6 +1208,7 @@ namespace System.Text.RegularExpressions.Generator
  
                  if (set.Chars is { Length: 1 })
                  {
+                    Debug.Assert(!set.Negated);
                      writer.WriteLine($"pos = inputSpan.Slice(0, pos).LastIndexOf({Literal(set.Chars[0])});");
                      using (EmitBlock(writer, "if (pos >= 0)"))
                      {
@@ -3307,7 +3332,7 @@ namespace System.Text.RegularExpressions.Generator
                  {
                      if (iterationCount is null &&
                          node.Kind is RegexNodeKind.Notonelazy &&
-                        subsequent?.FindStartingLiteral(4) is RegexNode.StartingLiteralData literal && // 5 == max optimized by IndexOfAny, and we need to reserve 1 for node.Ch
+                        subsequent?.FindStartingLiteral(4) is RegexNode.StartingLiteralData literal && // 5 == max efficiently optimized by IndexOfAny, and we need to reserve 1 for node.Ch
                          !literal.Negated && // not negated; can't search for both the node.Ch and a negated subsequent char with an IndexOf* method
                          (literal.String is not null ||
                           literal.SetChars is not null ||
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs

index afbad70..97e728a 100644 (file)
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
@@ -901,8 +901,6 @@ namespace System.Text.RegularExpressions
                          Ldloc(textSpanLocal);
                      }
  
-                    Debug.Assert(!primarySet.Negated || (primarySet.Chars is null && primarySet.AsciiSet is null));
-
                      if (primarySet.Chars is not null)
                      {
                          switch (primarySet.Chars.Length)
@@ -910,14 +908,14 @@ namespace System.Text.RegularExpressions
                              case 1:
                                  // tmp = ...IndexOf(setChars[0]);
                                  Ldc(primarySet.Chars[0]);
-                                Call(s_spanIndexOfChar);
+                                Call(primarySet.Negated ? s_spanIndexOfAnyExceptChar : s_spanIndexOfChar);
                                  break;
  
                              case 2:
                                  // tmp = ...IndexOfAny(setChars[0], setChars[1]);
                                  Ldc(primarySet.Chars[0]);
                                  Ldc(primarySet.Chars[1]);
-                                Call(s_spanIndexOfAnyCharChar);
+                                Call(primarySet.Negated ? s_spanIndexOfAnyExceptCharChar : s_spanIndexOfAnyCharChar);
                                  break;
  
                              case 3:
@@ -925,13 +923,13 @@ namespace System.Text.RegularExpressions
                                  Ldc(primarySet.Chars[0]);
                                  Ldc(primarySet.Chars[1]);
                                  Ldc(primarySet.Chars[2]);
-                                Call(s_spanIndexOfAnyCharCharChar);
+                                Call(primarySet.Negated ? s_spanIndexOfAnyExceptCharCharChar : s_spanIndexOfAnyCharCharChar);
                                  break;
  
                              default:
                                  Ldstr(new string(primarySet.Chars));
                                  Call(s_stringAsSpanMethod);
-                                Call(s_spanIndexOfAnySpan);
+                                Call(primarySet.Negated ? s_spanIndexOfAnyExceptSpan : s_spanIndexOfAnySpan);
                                  break;
                          }
                      }
@@ -1166,6 +1164,8 @@ namespace System.Text.RegularExpressions
  
                  if (set.Chars is { Length: 1 })
                  {
+                    Debug.Assert(!set.Negated);
+
                      // pos = inputSpan.Slice(0, pos).LastIndexOf(set.Chars[0]);
                      Ldloca(inputSpan);
                      Ldc(0);
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs

index 113c074..7597b37 100644 (file)
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs
@@ -3,7 +3,6 @@
  
  using System.Collections.Generic;
  using System.Diagnostics;
-using System.Globalization;
  
  namespace System.Text.RegularExpressions
  {
@@ -95,7 +94,7 @@ namespace System.Text.RegularExpressions
                  if (RegexPrefixAnalyzer.FindFirstCharClass(root) is string charClass)
                  {
                      // See if the set is limited to holding only a few characters.
-                    Span<char> scratch = stackalloc char[5]; // max optimized by IndexOfAny today
+                    Span<char> scratch = stackalloc char[5]; // max efficiently optimized by IndexOfAny today
                      int scratchCount;
                      char[]? chars = null;
                      if (!RegexCharClass.IsNegated(charClass) &&
@@ -109,12 +108,14 @@ namespace System.Text.RegularExpressions
                      {
                          // The set contains one and only one character, meaning every match starts
                          // with the same literal value (potentially case-insensitive). Search for that.
+                        Debug.Assert(!RegexCharClass.IsNegated(charClass));
                          FixedDistanceLiteral = (chars[0], null, 0);
                          FindMode = FindNextStartingPositionMode.LeadingChar_RightToLeft;
                      }
                      else
                      {
                          // The set may match multiple characters.  Search for that.
+                        Debug.Assert(!RegexCharClass.IsNegated(charClass) || chars is null);
                          FixedDistanceSets = new List<FixedDistanceSet>()
                          {
                              new FixedDistanceSet(chars, charClass, 0)
@@ -154,22 +155,32 @@ namespace System.Text.RegularExpressions
  
              // As a backup, see if we can find a literal after a leading atomic loop.  That might be better than whatever sets we find, so
              // we want to know whether we have one in our pocket before deciding whether to use a leading set (we'll prefer a leading
-            // set if it's something for which we can vectorize a search).
+            // set if it's something for which we can search efficiently).
              (RegexNode LoopNode, (char Char, string? String, char[]? Chars) Literal)? literalAfterLoop = RegexPrefixAnalyzer.FindLiteralFollowingLeadingLoop(root);
  
-            // If we got such sets, we'll likely use them.  However, if the best of them is something that doesn't support a vectorized
-            // search and we did successfully find a literal after an atomic loop we could search instead, we prefer the vectorizable search.
+            // If we got such sets, we'll likely use them.  However, if the best of them is something that doesn't support an efficient
+            // search and we did successfully find a literal after an atomic loop we could search instead, we prefer the efficient search.
+            // For example, if we have a negated set, we will still prefer the literal-after-an-atomic-loop because negated sets typically
+            // contain _many_ characters (e.g. [^a] is everything but 'a') and are thus more likely to very quickly match, which means any
+            // vectorization employed is less likely to kick in and be worth the startup overhead.
              if (fixedDistanceSets is not null)
              {
+                // Sort the sets by "quality", such that whatever set is first is the one deemed most efficient to use.
+                // In some searches, we may use multiple sets, so we want the subsequent ones to also be the efficiency runners-up.
                  RegexPrefixAnalyzer.SortFixedDistanceSetsByQuality(fixedDistanceSets);
-                if (fixedDistanceSets[0].Chars is not null || literalAfterLoop is null)
+
+                // If there is no literal after the loop, use whatever set we got.
+                // If there is a literal after the loop, consider it to be better than a negated set and better than a set with many characters.
+                if (literalAfterLoop is null ||
+                    (fixedDistanceSets[0].Chars is not null && !fixedDistanceSets[0].Negated))
                  {
                      // Determine whether to do searching based on one or more sets or on a single literal. Compiled engines
                      // don't need to special-case literals as they already do codegen to create the optimal lookup based on
                      // the set's characteristics.
                      if (!compiled &&
                          fixedDistanceSets.Count == 1 &&
-                        fixedDistanceSets[0].Chars is { Length: 1 })
+                        fixedDistanceSets[0].Chars is { Length: 1 } &&
+                        !fixedDistanceSets[0].Negated)
                      {
                          FixedDistanceLiteral = (fixedDistanceSets[0].Chars![0], null, fixedDistanceSets[0].Distance);
                          FindMode = FindNextStartingPositionMode.FixedDistanceChar_LeftToRight;
@@ -186,8 +197,9 @@ namespace System.Text.RegularExpressions
  
                          // Store the sets, and compute which mode to use.
                          FixedDistanceSets = fixedDistanceSets;
-                        FindMode = (fixedDistanceSets.Count == 1 && fixedDistanceSets[0].Distance == 0) ? FindNextStartingPositionMode.LeadingSet_LeftToRight
-                            : FindNextStartingPositionMode.FixedDistanceSets_LeftToRight;
+                        FindMode = (fixedDistanceSets.Count == 1 && fixedDistanceSets[0].Distance == 0) ?
+                            FindNextStartingPositionMode.LeadingSet_LeftToRight :
+                            FindNextStartingPositionMode.FixedDistanceSets_LeftToRight;
                          _asciiLookups = new uint[fixedDistanceSets.Count][];
                      }
                      return;
@@ -322,6 +334,7 @@ namespace System.Text.RegularExpressions
              return best;
          }
  
+#if SYSTEM_TEXT_REGULAREXPRESSIONS
          /// <summary>Try to advance to the next starting position that might be a location for a match.</summary>
          /// <param name="textSpan">The text to search.</param>
          /// <param name="pos">The position in <paramref name="textSpan"/>.  This is updated with the found position.</param>
@@ -578,12 +591,11 @@ namespace System.Text.RegularExpressions
                      {
                          FixedDistanceSet primarySet = FixedDistanceSets![0];
                          char[]? chars = primarySet.Chars;
-                        string set = primarySet.Set;
  
                          ReadOnlySpan<char> span = textSpan.Slice(pos);
                          if (chars is not null)
                          {
-                            int i = span.IndexOfAny(chars);
+                            int i = primarySet.Negated ? span.IndexOfAnyExcept(chars) : span.IndexOfAny(chars);
                              if (i >= 0)
                              {
                                  pos += i;
@@ -595,7 +607,7 @@ namespace System.Text.RegularExpressions
                              ref uint[]? startingAsciiLookup = ref _asciiLookups![0];
                              for (int i = 0; i < span.Length; i++)
                              {
-                                if (RegexCharClass.CharInClass(span[i], set, ref startingAsciiLookup))
+                                if (RegexCharClass.CharInClass(span[i], primarySet.Set, ref startingAsciiLookup))
                                  {
                                      pos += i;
                                      return true;
@@ -653,7 +665,8 @@ namespace System.Text.RegularExpressions
                              for (int inputPosition = pos; inputPosition <= endMinusRequiredLength; inputPosition++)
                              {
                                  int offset = inputPosition + primarySet.Distance;
-                                int index = textSpan.Slice(offset).IndexOfAny(primarySet.Chars);
+                                ReadOnlySpan<char> textSpanAtOffset = textSpan.Slice(offset);
+                                int index = primarySet.Negated ? textSpanAtOffset.IndexOfAnyExcept(primarySet.Chars) : textSpanAtOffset.IndexOfAny(primarySet.Chars);
                                  if (index < 0)
                                  {
                                      break;
@@ -769,6 +782,7 @@ namespace System.Text.RegularExpressions
                      return true;
              }
          }
+#endif
      }
  
      /// <summary>Mode to use for searching for the next location of a possible match.</summary>
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs

index 7cf3b9f..e579ad1 100644 (file)
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
@@ -1419,7 +1419,7 @@ namespace System.Text.RegularExpressions
          /// A tuple of data about the literal: only one of the Char/String/SetChars fields is relevant.
          /// The Negated value indicates whether the Char/SetChars should be considered exclusionary.
          /// </returns>
-        public StartingLiteralData? FindStartingLiteral(int maxSetCharacters = 5) // 5 is max optimized by IndexOfAny today
+        public StartingLiteralData? FindStartingLiteral(int maxSetCharacters = 5) // 5 is max efficiently optimized by IndexOfAny today
          {
              Debug.Assert(maxSetCharacters >= 0 && maxSetCharacters <= 128, $"{nameof(maxSetCharacters)} == {maxSetCharacters} should be small enough to be stack allocated.");
  
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs

index 88553b3..56ff64c 100644 (file)
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs
@@ -3,7 +3,6 @@
  
  using System.Collections.Generic;
  using System.Diagnostics;
-using System.Globalization;
  using System.Runtime.CompilerServices;
  using System.Threading;
  
@@ -192,7 +191,12 @@ namespace System.Text.RegularExpressions
              // Find all fixed-distance sets.
              var results = new List<RegexFindOptimizations.FixedDistanceSet>();
              int distance = 0;
-            TryFindFixedSets(root, results, ref distance, thorough);
+            TryFindRawFixedSets(root, results, ref distance, thorough);
+#if DEBUG
+            results.ForEach(r => Debug.Assert(
+                !r.Negated && r.Chars is null && r.AsciiSet is null && r.Range is null,
+                $"{nameof(TryFindRawFixedSets)} should have only populated {nameof(r.Set)} and {nameof(r.Distance)}"));
+#endif
  
              // Remove any sets that match everything; they're not helpful.  (This check exists primarily to weed
              // out use of . in Singleline mode, but also filters out explicit sets like [\s\S].)
@@ -221,7 +225,7 @@ namespace System.Text.RegularExpressions
  
              // For every entry, try to get the chars that make up the set, if there are few enough.
              // For any for which we couldn't get the small chars list, see if we can get other useful info.
-            Span<char> scratch = stackalloc char[5]; // max optimized by IndexOfAny today
+            Span<char> scratch = stackalloc char[5]; // max efficiently optimized by IndexOfAny today
              for (int i = 0; i < results.Count; i++)
              {
                  RegexFindOptimizations.FixedDistanceSet result = results[i];
@@ -229,7 +233,7 @@ namespace System.Text.RegularExpressions
  
                  int count = RegexCharClass.GetSetChars(result.Set, scratch);
  
-                if (!result.Negated && count > 0)
+                if (count > 0)
                  {
                      result.Chars = scratch.Slice(0, count).ToArray();
                  }
@@ -258,8 +262,10 @@ namespace System.Text.RegularExpressions
              // is at a fixed distance, in which case distance will have been updated to include the full length
              // of the node.  If it returns false, the node isn't entirely fixed, in which case subsequent nodes
              // shouldn't be examined and distance should no longer be trusted.  However, regardless of whether it
-            // returns true or false, it may have populated results, and all populated results are valid.
-            static bool TryFindFixedSets(RegexNode node, List<RegexFindOptimizations.FixedDistanceSet> results, ref int distance, bool thorough)
+            // returns true or false, it may have populated results, and all populated results are valid. Each
+            // FixedDistanceSet result will only have its Set string and Distance populated; the rest is left
+            // to be populated by FindFixedDistanceSets after this returns.
+            static bool TryFindRawFixedSets(RegexNode node, List<RegexFindOptimizations.FixedDistanceSet> results, ref int distance, bool thorough)
              {
                  if (!StackHelper.TryEnsureSufficientExecutionStack())
                  {
@@ -357,7 +363,7 @@ namespace System.Text.RegularExpressions
                      case RegexNodeKind.Atomic:
                      case RegexNodeKind.Group:
                      case RegexNodeKind.Capture:
-                        return TryFindFixedSets(node.Child(0), results, ref distance, thorough);
+                        return TryFindRawFixedSets(node.Child(0), results, ref distance, thorough);
  
                      case RegexNodeKind.Lazyloop or RegexNodeKind.Loop when node.M > 0:
                          // This effectively only iterates the loop once.  If deemed valuable,
@@ -366,7 +372,7 @@ namespace System.Text.RegularExpressions
                          // summed distance for all node.M iterations.  If node.M == node.N,
                          // this would then also allow continued evaluation of the rest of the
                          // expression after the loop.
-                        TryFindFixedSets(node.Child(0), results, ref distance, thorough);
+                        TryFindRawFixedSets(node.Child(0), results, ref distance, thorough);
                          return false;
  
                      case RegexNodeKind.Concatenate:
@@ -374,7 +380,7 @@ namespace System.Text.RegularExpressions
                              int childCount = node.ChildCount();
                              for (int i = 0; i < childCount; i++)
                              {
-                                if (!TryFindFixedSets(node.Child(i), results, ref distance, thorough))
+                                if (!TryFindRawFixedSets(node.Child(i), results, ref distance, thorough))
                                  {
                                      return false;
                                  }
@@ -394,7 +400,7 @@ namespace System.Text.RegularExpressions
                              {
                                  localResults.Clear();
                                  int localDistance = 0;
-                                allSameSize &= TryFindFixedSets(node.Child(i), localResults, ref localDistance, thorough);
+                                allSameSize &= TryFindRawFixedSets(node.Child(i), localResults, ref localDistance, thorough);
  
                                  if (localResults.Count == 0)
                                  {
@@ -475,64 +481,75 @@ namespace System.Text.RegularExpressions
                  int s1RangeLength = s1.Range is not null ? GetRangeLength(s1.Range.Value, s1Negated) : 0;
                  int s2RangeLength = s2.Range is not null ? GetRangeLength(s2.Range.Value, s2Negated) : 0;
  
-                Debug.Assert(!s1Negated || s1Chars is null);
-                Debug.Assert(!s2Negated || s2Chars is null);
+                // If one set is negated and the other isn't, prefer the non-negated set. In general, negated
+                // sets are large and thus likely to match more frequently, making them slower to search for.
+                if (s1Negated != s2Negated)
+                {
+                    return s2Negated ? -1 : 1;
+                }
  
-                // If both have chars, prioritize the one with the smaller frequency for those chars.
-                if (s1Chars is not null && s2Chars is not null)
+                // If we extracted only a few chars and the sets are negated, they both represent very large
+                // sets that are difficult to compare for quality, so only compare chars for non-negated sets.
+                if (!s1Negated)
                  {
-                    // Prefer sets with less frequent values.  The frequency is only an approximation,
-                    // used as a tie-breaker when we'd otherwise effectively be picking randomly.
-                    // True frequencies will vary widely based on the actual data being searched, the language of the data, etc.
-                    float s1Frequency = SumFrequencies(s1Chars);
-                    float s2Frequency = SumFrequencies(s2Chars);
+                    Debug.Assert(!s2Negated);
  
-                    if (s1Frequency != s2Frequency)
+                    // If both have chars, prioritize the one with the smaller frequency for those chars.
+                    if (s1Chars is not null && s2Chars is not null)
                      {
-                        return s1Frequency.CompareTo(s2Frequency);
-                    }
+                        // Prefer sets with less frequent values.  The frequency is only an approximation,
+                        // used as a tie-breaker when we'd otherwise effectively be picking randomly.
+                        // True frequencies will vary widely based on the actual data being searched, the language of the data, etc.
+                        float s1Frequency = SumFrequencies(s1Chars);
+                        float s2Frequency = SumFrequencies(s2Chars);
  
-                    if (!RegexCharClass.IsAscii(s1Chars) && !RegexCharClass.IsAscii(s2Chars))
-                    {
-                        // Prefer the set with fewer values.
-                        return s1CharsLength.CompareTo(s2CharsLength);
-                    }
+                        if (s1Frequency != s2Frequency)
+                        {
+                            return s1Frequency.CompareTo(s2Frequency);
+                        }
  
-                    [MethodImpl(MethodImplOptions.AggressiveInlining)]
-                    static float SumFrequencies(char[] chars)
-                    {
-                        float sum = 0;
-                        foreach (char c in chars)
+                        if (!RegexCharClass.IsAscii(s1Chars) && !RegexCharClass.IsAscii(s2Chars))
+                        {
+                            // Prefer the set with fewer values.
+                            return s1CharsLength.CompareTo(s2CharsLength);
+                        }
+
+                        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+                        static float SumFrequencies(char[] chars)
                          {
-                            // Lookup each character in the table.  Values >= 128 are ignored
-                            // and thus we'll get skew in the data.  It's already a gross approximation, though,
-                            // and it is primarily meant for disambiguation of ASCII letters.
-                            if (c < 128)
+                            float sum = 0;
+                            foreach (char c in chars)
                              {
-                                sum += Frequency[c];
+                                // Lookup each character in the table.  Values >= 128 are ignored
+                                // and thus we'll get skew in the data.  It's already a gross approximation, though,
+                                // and it is primarily meant for disambiguation of ASCII letters.
+                                if (c < 128)
+                                {
+                                    sum += Frequency[c];
+                                }
                              }
+                            return sum;
                          }
-                        return sum;
                      }
-                }
  
-                // If one has chars and the other has a range, prefer the shorter set.
-                if ((s1CharsLength > 0 && s2RangeLength > 0) || (s1RangeLength > 0 && s2CharsLength > 0))
-                {
-                    int c = Math.Max(s1CharsLength, s1RangeLength).CompareTo(Math.Max(s2CharsLength, s2RangeLength));
-                    if (c != 0)
+                    // If one has chars and the other has a range, prefer the shorter set.
+                    if ((s1CharsLength > 0 && s2RangeLength > 0) || (s1RangeLength > 0 && s2CharsLength > 0))
                      {
-                        return c;
-                    }
+                        int c = Math.Max(s1CharsLength, s1RangeLength).CompareTo(Math.Max(s2CharsLength, s2RangeLength));
+                        if (c != 0)
+                        {
+                            return c;
+                        }
  
-                    // If lengths are the same, prefer the chars.
-                    return s1CharsLength > 0 ? -1 : 1;
-                }
+                        // If lengths are the same, prefer the chars.
+                        return s1CharsLength > 0 ? -1 : 1;
+                    }
  
-                // If one has chars and the other doesn't, prioritize the one with chars.
-                if ((s1CharsLength > 0) != (s2CharsLength > 0))
-                {
-                    return s1CharsLength > 0 ? -1 : 1;
+                    // If one has chars and the other doesn't, prioritize the one with chars.
+                    if ((s1CharsLength > 0) != (s2CharsLength > 0))
+                    {
+                        return s1CharsLength > 0 ? -1 : 1;
+                    }
                  }
  
                  // If one has a range and the other doesn't, prioritize the one with a range.
author	Stephen Toub <stoub@microsoft.com>
	Tue, 18 Jul 2023 21:38:16 +0000 (17:38 -0400)
committer	GitHub <noreply@github.com>
	Tue, 18 Jul 2023 21:38:16 +0000 (17:38 -0400)
src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs		patch \| blob \| history
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs		patch \| blob \| history
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs		patch \| blob \| history
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs		patch \| blob \| history
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs		patch \| blob \| history