Factor common prefix text out of Regex alternations (#2171)

author Stephen Toub <stoub@microsoft.com>

Mon, 27 Jan 2020 19:56:04 +0000 (14:56 -0500)

committer GitHub <noreply@github.com>

Mon, 27 Jan 2020 19:56:04 +0000 (14:56 -0500)
author Stephen Toub <stoub@microsoft.com>
Mon, 27 Jan 2020 19:56:04 +0000 (14:56 -0500)
committer GitHub <noreply@github.com>
Mon, 27 Jan 2020 19:56:04 +0000 (14:56 -0500)
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs

index 05116c6..81e4a51 100644 (file)
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs
@@ -105,16 +105,7 @@ namespace System.Text.RegularExpressions
  #if DEBUG
              if (IsDebug)
              {
-                Debug.Write($"Pattern:     {pattern}");
-                RegexOptions displayOptions = options & ~RegexOptions.Debug;
-                if (displayOptions != RegexOptions.None)
-                {
-                    Debug.Write($"Options:     {displayOptions}");
-                }
-                if (matchTimeout != InfiniteMatchTimeout)
-                {
-                    Debug.Write($"Timeout:     {matchTimeout}");
-                }
+                Debug.WriteLine($"Pattern: {pattern}    Options: {options & ~RegexOptions.Debug}    Timeout: {(matchTimeout == InfiniteMatchTimeout ? "infinite" : matchTimeout.ToString())}");
              }
  #endif
  
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexBoyerMoore.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexBoyerMoore.cs

index 99b5825..ffcf168 100644 (file)
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexBoyerMoore.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexBoyerMoore.cs
@@ -339,37 +339,35 @@ namespace System.Text.RegularExpressions
          }
  
  #if DEBUG
-        /// <summary>
-        /// Used when dumping for debugging.
-        /// </summary>
+        /// <summary>Used when dumping for debugging.</summary>
          [ExcludeFromCodeCoverage]
          public override string ToString() => Pattern;
  
          [ExcludeFromCodeCoverage]
          public string Dump(string indent)
          {
-            StringBuilder sb = new StringBuilder();
+            var sb = new StringBuilder();
  
              sb.AppendLine($"{indent}BM Pattern: {Pattern}");
-            sb.Append(indent + "Positive: ");
-            for (int i = 0; i < Positive.Length; i++)
+
+            sb.Append($"{indent}Positive: ");
+            foreach (int i in Positive)
              {
-                sb.Append(Positive[i].ToString(CultureInfo.InvariantCulture) + " ");
+                sb.Append($"{i} ");
              }
              sb.AppendLine();
  
              if (NegativeASCII != null)
              {
-                sb.Append(indent + "Negative table: ");
+                sb.Append($"{indent}Negative table: ");
                  for (int i = 0; i < NegativeASCII.Length; i++)
                  {
                      if (NegativeASCII[i] != Pattern.Length)
                      {
-                        sb.Append(" {" + Regex.Escape(Convert.ToString((char)i, CultureInfo.InvariantCulture)) + " " + NegativeASCII[i].ToString(CultureInfo.InvariantCulture) + "}");
+                        sb.Append($" {{{Regex.Escape(((char)i).ToString())} {NegativeASCII[i]}}}");
                      }
                  }
              }
-
              sb.AppendLine();
  
              return sb.ToString();
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs

index 76288af..5d5f8be 100644 (file)
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
@@ -735,7 +735,7 @@ namespace System.Text.RegularExpressions
              return set[SetStartIndex];
          }
  
-        public static bool IsMergeable(string? charClass) =>
+        public static bool IsMergeable(string charClass) =>
              charClass != null &&
              !IsNegated(charClass) &&
              !IsSubtraction(charClass);
@@ -1541,7 +1541,7 @@ namespace System.Text.RegularExpressions
              int categoryLength = set[CategoryLengthIndex];
              int endPosition = SetStartIndex + setLength + categoryLength;
  
-            StringBuilder desc = new StringBuilder();
+            var desc = new StringBuilder();
  
              desc.Append('[');
  
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCode.cs

index 1e894f1..aa4dae5 100644 (file)
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCode.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCode.cs
@@ -285,7 +285,7 @@ namespace System.Text.RegularExpressions
          [ExcludeFromCodeCoverage]
          public string OpcodeDescription(int offset)
          {
-            StringBuilder sb = new StringBuilder();
+            var sb = new StringBuilder();
              int opcode = Codes[offset];
  
              sb.AppendFormat("{0:D6} ", offset);
@@ -307,7 +307,7 @@ namespace System.Text.RegularExpressions
                  case Notoneloopatomic:
                  case Onelazy:
                  case Notonelazy:
-                    sb.Append(RegexCharClass.CharDescription((char)Codes[offset + 1]));
+                    sb.Append("'").Append(RegexCharClass.CharDescription((char)Codes[offset + 1])).Append("'");
                      break;
  
                  case Set:
@@ -319,7 +319,7 @@ namespace System.Text.RegularExpressions
                      break;
  
                  case Multi:
-                    sb.Append(Strings[Codes[offset + 1]]);
+                    sb.Append('"').Append(Strings[Codes[offset + 1]]).Append('"');
                      break;
  
                  case Ref:
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs

index 98603a1..a5fe30e 100644 (file)
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
@@ -1778,7 +1778,7 @@ namespace System.Text.RegularExpressions
                          // Its children must all also be supported.
                          case RegexNode.Alternate:
                              if (node.Next != null &&
-                                (node.Next.Type == RegexNode.Atomic || // atomic alternate
+                                (node.IsAtomicByParent() || // atomic alternate
                                  (node.Next.Type == RegexNode.Capture && node.Next.Next is null))) // root alternate
                              {
                                  goto case RegexNode.Concatenate;
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFCD.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFCD.cs

index 82c140a..d8a7495 100644 (file)
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFCD.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFCD.cs
@@ -227,29 +227,20 @@ namespace System.Text.RegularExpressions
          [ExcludeFromCodeCoverage]
          public static string AnchorDescription(int anchors)
          {
-            StringBuilder sb = new StringBuilder();
-
-            if (0 != (anchors & Beginning))
-                sb.Append(", Beginning");
-            if (0 != (anchors & Start))
-                sb.Append(", Start");
-            if (0 != (anchors & Bol))
-                sb.Append(", Bol");
-            if (0 != (anchors & Boundary))
-                sb.Append(", Boundary");
-            if (0 != (anchors & ECMABoundary))
-                sb.Append(", ECMABoundary");
-            if (0 != (anchors & Eol))
-                sb.Append(", Eol");
-            if (0 != (anchors & End))
-                sb.Append(", End");
-            if (0 != (anchors & EndZ))
-                sb.Append(", EndZ");
-
-            if (sb.Length >= 2)
-                return (sb.ToString(2, sb.Length - 2));
-
-            return "None";
+            var sb = new StringBuilder();
+
+            if ((anchors & Beginning) != 0) sb.Append(", Beginning");
+            if ((anchors & Start) != 0) sb.Append(", Start");
+            if ((anchors & Bol) != 0) sb.Append(", Bol");
+            if ((anchors & Boundary) != 0) sb.Append(", Boundary");
+            if ((anchors & ECMABoundary) != 0) sb.Append(", ECMABoundary");
+            if ((anchors & Eol) != 0) sb.Append(", Eol");
+            if ((anchors & End) != 0) sb.Append(", End");
+            if ((anchors & EndZ) != 0) sb.Append(", EndZ");
+
+            return sb.Length >= 2 ?
+                sb.ToString(2, sb.Length - 2) :
+                "None";
          }
  #endif
  
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs

index 06cca89..5c656c6 100644 (file)
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
@@ -227,9 +227,13 @@ namespace System.Text.RegularExpressions
                                      case Alternate:
                                      case Loop:
                                      case Lazyloop:
-                                        var atomic = new RegexNode(Atomic, Options);
-                                        atomic.AddChild(existingChild);
-                                        node.ReplaceChild(node.ChildCount() - 1, atomic);
+                                        // Make the node atomic if it isn't already (as conferred by a parent node being atomic).
+                                        if (!existingChild.IsAtomicByParent())
+                                        {
+                                            var atomic = new RegexNode(Atomic, Options);
+                                            atomic.AddChild(existingChild);
+                                            node.ReplaceChild(node.ChildCount() - 1, atomic);
+                                        }
                                          break;
                                  }
                                  continue;
@@ -299,7 +303,7 @@ namespace System.Text.RegularExpressions
              // Optimization: Unnecessary root atomic.
              // If the root node under the implicit Capture is an Atomic, the Atomic is useless as there's nothing
              // to backtrack into it, so we can remove it.
-            if (rootNode.Child(0).Type == Atomic)
+            while (rootNode.Child(0).Type == Atomic)
              {
                  rootNode.ReplaceChild(0, rootNode.Child(0).Child(0));
              }
@@ -308,6 +312,29 @@ namespace System.Text.RegularExpressions
              return rootNode;
          }
  
+        /// <summary>Whether this node is considered to be atomic based on its parent.</summary>
+        /// <remarks>
+        /// This is used to determine whether additional atomic nodes may be valuable to
+        /// be introduced into the tree.  It should not be used to determine for sure whether
+        /// a node will be backtracked into.
+        /// </remarks>
+        public bool IsAtomicByParent()
+        {
+            RegexNode? next = Next;
+            if (next is null) return false;
+            if (next.Type == Atomic) return true;
+
+            // We only walk up one group as a balance between optimization and cost.
+            if ((next.Type != Concatenate && next.Type != Capture) ||
+                next.Child(next.ChildCount() - 1) != this)
+            {
+                return false;
+            }
+
+            next = next.Next;
+            return next != null && next.Type == Atomic;
+        }
+
          /// <summary>
          /// Removes redundant nodes from the subtree, and returns a reduced subtree.
          /// </summary>
@@ -545,137 +572,321 @@ namespace System.Text.RegularExpressions
              return this;
          }
  
-        /// <summary>
-        /// Combine adjacent sets/chars.
-        /// Basic optimization. Single-letter alternations can be replaced
-        /// by faster set specifications, and nested alternations with no
-        /// intervening operators can be flattened:
-        ///
-        /// a|b|c|def|g|h -> [a-c]|def|[gh]
-        /// apple|(?:orange|pear)|grape -> apple|orange|pear|grape
-        /// </summary>
+        /// <summary>Optimize an alternation.</summary>
          private RegexNode ReduceAlternation()
          {
-            int childCount = ChildCount();
-            if (childCount == 0)
+            switch (ChildCount())
              {
-                return new RegexNode(Nothing, Options);
-            }
+                case 0:
+                    return new RegexNode(Nothing, Options);
  
-            if (childCount == 1)
-            {
-                return Child(0);
-            }
+                case 1:
+                    return Child(0);
  
-            bool wasLastSet = false;
-            bool lastNodeCannotMerge = false;
-            RegexOptions optionsLast = 0;
-            RegexOptions optionsAt;
-            int i;
-            int j;
-            RegexNode at;
-            RegexNode prev;
+                default:
+                    ReduceSingleLetterAndNestedAlternations();
+                    RegexNode newThis = StripEnation(Nothing);
+                    return newThis != this ? newThis : ExtractCommonPrefix();
+            }
  
-            List<RegexNode> children = (List<RegexNode>)Children!;
-            for (i = 0, j = 0; i < children.Count; i++, j++)
+            // This function performs two optimizations:
+            // - Single-letter alternations can be replaced by faster set specifications
+            //   e.g. "a|b|c|def|g|h" -> "[a-c]|def|[gh]"
+            // - Nested alternations with no intervening operators can be flattened:
+            //   e.g. "apple|(?:orange|pear)|grape" -> "apple|orange|pear|grape"
+            void ReduceSingleLetterAndNestedAlternations()
              {
-                at = children[i];
+                bool wasLastSet = false;
+                bool lastNodeCannotMerge = false;
+                RegexOptions optionsLast = 0;
+                RegexOptions optionsAt;
+                int i;
+                int j;
+                RegexNode at;
+                RegexNode prev;
+
+                List<RegexNode> children = (List<RegexNode>)Children!;
+                for (i = 0, j = 0; i < children.Count; i++, j++)
+                {
+                    at = children[i];
  
-                if (j < i)
-                    children[j] = at;
+                    if (j < i)
+                        children[j] = at;
  
-                while (true)
-                {
-                    if (at.Type == Alternate)
+                    while (true)
                      {
-                        if (at.Children is List<RegexNode> atChildren)
+                        if (at.Type == Alternate)
                          {
-                            for (int k = 0; k < atChildren.Count; k++)
+                            if (at.Children is List<RegexNode> atChildren)
+                            {
+                                for (int k = 0; k < atChildren.Count; k++)
+                                {
+                                    atChildren[k].Next = this;
+                                }
+                                children.InsertRange(i + 1, atChildren);
+                            }
+                            else
                              {
-                                atChildren[k].Next = this;
+                                RegexNode atChild = (RegexNode)at.Children!;
+                                atChild.Next = this;
+                                children.Insert(i + 1, atChild);
                              }
-                            children.InsertRange(i + 1, atChildren);
+                            j--;
                          }
-                        else
+                        else if (at.Type == Set || at.Type == One)
                          {
-                            RegexNode atChild = (RegexNode)at.Children!;
-                            atChild.Next = this;
-                            children.Insert(i + 1, atChild);
-                        }
-                        j--;
-                    }
-                    else if (at.Type == Set || at.Type == One)
-                    {
-                        // Cannot merge sets if L or I options differ, or if either are negated.
-                        optionsAt = at.Options & (RegexOptions.RightToLeft | RegexOptions.IgnoreCase);
+                            // Cannot merge sets if L or I options differ, or if either are negated.
+                            optionsAt = at.Options & (RegexOptions.RightToLeft | RegexOptions.IgnoreCase);
  
-                        if (at.Type == Set)
-                        {
-                            if (!wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge || !RegexCharClass.IsMergeable(at.Str))
+                            if (at.Type == Set)
+                            {
+                                if (!wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge || !RegexCharClass.IsMergeable(at.Str!))
+                                {
+                                    wasLastSet = true;
+                                    lastNodeCannotMerge = !RegexCharClass.IsMergeable(at.Str!);
+                                    optionsLast = optionsAt;
+                                    break;
+                                }
+                            }
+                            else if (!wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge)
                              {
                                  wasLastSet = true;
-                                lastNodeCannotMerge = !RegexCharClass.IsMergeable(at.Str);
+                                lastNodeCannotMerge = false;
                                  optionsLast = optionsAt;
                                  break;
                              }
+
+
+                            // The last node was a Set or a One, we're a Set or One and our options are the same.
+                            // Merge the two nodes.
+                            j--;
+                            prev = children[j];
+
+                            RegexCharClass prevCharClass;
+                            if (prev.Type == One)
+                            {
+                                prevCharClass = new RegexCharClass();
+                                prevCharClass.AddChar(prev.Ch);
+                            }
+                            else
+                            {
+                                prevCharClass = RegexCharClass.Parse(prev.Str!);
+                            }
+
+                            if (at.Type == One)
+                            {
+                                prevCharClass.AddChar(at.Ch);
+                            }
+                            else
+                            {
+                                RegexCharClass atCharClass = RegexCharClass.Parse(at.Str!);
+                                prevCharClass.AddCharClass(atCharClass);
+                            }
+
+                            prev.Type = Set;
+                            prev.Str = prevCharClass.ToStringClass();
+                        }
+                        else if (at.Type == Nothing)
+                        {
+                            j--;
                          }
-                        else if (!wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge)
+                        else
                          {
-                            wasLastSet = true;
+                            wasLastSet = false;
                              lastNodeCannotMerge = false;
-                            optionsLast = optionsAt;
-                            break;
                          }
+                        break;
+                    }
+                }
+
+                if (j < i)
+                {
+                    children.RemoveRange(j, i - j);
+                }
+            }
  
+            // Analyzes all the branches of the alternation for text that's identical at the beginning
+            // of every branch.  That text is then pulled out into its own one or multi node in a
+            // concatenation with the alternation (whose branches are updated to remove that prefix).
+            // This is valuable for a few reasons.  One, it exposes potentially more text to the
+            // expression prefix analyzer used to influence FindFirstChar.  Second, it exposes more
+            // potential alternation optimizations, e.g. if the same prefix is followed in two branches
+            // by sets that can be merged.  Third, it reduces the amount of duplicated comparisons required
+            // if we end up backtracking into subsequent branches.
+            RegexNode ExtractCommonPrefix()
+            {
+                // To keep things relatively simple, we currently only handle:
+                // - Branches that are one or multi nodes, or that are concatenations beginning with one or multi nodes.
+                // - All branches having the same options.
+                // - Text, rather than also trying to combine identical sets that start each branch.
+
+                Debug.Assert(Children is List<RegexNode>);
+                var children = (List<RegexNode>)Children;
+                Debug.Assert(children.Count >= 2);
+
+                // Process the first branch to get the maximum possible common string.
+                RegexNode? startingNode = FindBranchOneMultiStart(children[0]);
+                if (startingNode is null)
+                {
+                    return this;
+                }
  
-                        // The last node was a Set or a One, we're a Set or One and our options are the same.
-                        // Merge the two nodes.
-                        j--;
-                        prev = children[j];
+                RegexOptions startingNodeOptions = startingNode.Options;
+                string? originalStartingString = startingNode.Str;
+                ReadOnlySpan<char> startingSpan = startingNode.Type == One ? stackalloc char[1] { startingNode.Ch } : (ReadOnlySpan<char>)originalStartingString;
+                Debug.Assert(startingSpan.Length > 0);
+
+                // Now compare the rest of the branches against it.
+                for (int i = 1; i < children.Count; i++)
+                {
+                    // Get the starting node of the next branch.
+                    startingNode = FindBranchOneMultiStart(children[i]);
+                    if (startingNode is null || startingNode.Options != startingNodeOptions)
+                    {
+                        return this;
+                    }
  
-                        RegexCharClass prevCharClass;
-                        if (prev.Type == One)
+                    // See if the new branch's prefix has a shared prefix with the current one.
+                    // If it does, shorten to that; if it doesn't, bail.
+                    if (startingNode.Type == One)
+                    {
+                        if (startingSpan[0] != startingNode.Ch)
                          {
-                            prevCharClass = new RegexCharClass();
-                            prevCharClass.AddChar(prev.Ch);
+                            return this;
                          }
-                        else
+
+                        if (startingSpan.Length != 1)
                          {
-                            prevCharClass = RegexCharClass.Parse(prev.Str!);
+                            startingSpan = startingSpan.Slice(0, 1);
                          }
+                    }
+                    else
+                    {
+                        Debug.Assert(startingNode.Type == Multi);
+                        Debug.Assert(startingNode.Str!.Length > 0);
  
-                        if (at.Type == One)
+                        int minLength = Math.Min(startingSpan.Length, startingNode.Str.Length);
+                        int c = 0;
+                        while (c < minLength && startingSpan[c] == startingNode.Str[c]) c++;
+                        if (c == 0)
                          {
-                            prevCharClass.AddChar(at.Ch);
+                            return this;
+                        }
+
+                        startingSpan = startingSpan.Slice(0, c);
+                    }
+                }
+
+                // If we get here, we have a starting string prefix shared by all branches.
+                Debug.Assert(startingSpan.Length > 0);
+
+                // Now remove the prefix from each branch.
+                for (int i = 0; i < children.Count; i++)
+                {
+                    RegexNode branch = children[i];
+                    if (branch.Type == Concatenate)
+                    {
+                        ProcessOneOrMulti(branch.Child(0), startingSpan);
+                        ReplaceChild(i, branch.Reduce());
+                    }
+                    else
+                    {
+                        ProcessOneOrMulti(branch, startingSpan);
+                    }
+
+                    // Remove the starting text from the one or multi node.  This may end up changing
+                    // the type of the node to be Empty if the starting text matches the node's full value.
+                    static void ProcessOneOrMulti(RegexNode node, ReadOnlySpan<char> startingSpan)
+                    {
+                        if (node.Type == One)
+                        {
+                            Debug.Assert(startingSpan.Length == 1);
+                            Debug.Assert(startingSpan[0] == node.Ch);
+                            node.Type = Empty;
+                            node.Ch = '\0';
                          }
                          else
                          {
-                            RegexCharClass atCharClass = RegexCharClass.Parse(at.Str!);
-                            prevCharClass.AddCharClass(atCharClass);
+                            Debug.Assert(node.Type == Multi);
+                            Debug.Assert(node.Str.AsSpan().StartsWith(startingSpan, StringComparison.Ordinal));
+                            if (node.Str!.Length == startingSpan.Length)
+                            {
+                                node.Type = Empty;
+                                node.Str = null;
+                            }
+                            else if (node.Str.Length - 1 == startingSpan.Length)
+                            {
+                                node.Type = One;
+                                node.Ch = node.Str[^1];
+                                node.Str = null;
+                            }
+                            else
+                            {
+                                node.Str = node.Str.Substring(startingSpan.Length);
+                            }
                          }
+                    }
+                }
  
-                        prev.Type = Set;
-                        prev.Str = prevCharClass.ToStringClass();
+                // We may have changed multiple branches to be Empty, but we only need to keep
+                // the first (keeping the rest would just duplicate work in backtracking, though
+                // it would also mean the original regex had at least two identical branches).
+                for (int firstEmpty = 0; firstEmpty < children.Count; firstEmpty++)
+                {
+                    if (children[firstEmpty].Type != Empty)
+                    {
+                        continue;
                      }
-                    else if (at.Type == Nothing)
+
+                    // Found the first empty.  Now starting after it, remove all subsequent found Empty nodes,
+                    // pushing everything else down. (In the future, should we want to there's also the opportunity
+                    // here to remove other duplication, but such duplication is a more egregious mistake on the
+                    // part of the expression author.)
+                    int i = firstEmpty + 1;
+                    int j = i;
+                    while (i < children.Count)
                      {
-                        j--;
+                        if (children[i].Type != Empty)
+                        {
+                            if (j != i)
+                            {
+                                children[j] = children[i];
+                            }
+                            j++;
+                        }
+                        i++;
                      }
-                    else
+
+                    if (j < i)
                      {
-                        wasLastSet = false;
-                        lastNodeCannotMerge = false;
+                        children.RemoveRange(j, i - j);
                      }
+
                      break;
                  }
-            }
  
-            if (j < i)
-            {
-                children.RemoveRange(j, i - j);
-            }
+                var concat = new RegexNode(Concatenate, Options); // use same options as the Alternate
+                concat.AddChild(startingSpan.Length == 1 ? // use same options as the branches
+                    new RegexNode(One, startingNodeOptions) { Ch = startingSpan[0] } :
+                    new RegexNode(Multi, startingNodeOptions) { Str = originalStartingString?.Length == startingSpan.Length ? originalStartingString : startingSpan.ToString() });
+                concat.AddChild(this); // this will re-reduce the node, allowing for newly exposed possible optimizations in what came after the prefix
+                return concat;
+
+                // Finds the starting one or multi of the branch, if it has one; otherwise, returns null.
+                // For simplicity, this only considers branches that are One or Multi, or a Concatenation
+                // beginning with a One or Multi.  We don't traverse more than one level to avoid the
+                // complication of then having to later update that hierarchy when removing the prefix,
+                // but it could be done in the future if proven beneficial enough.
+                static RegexNode? FindBranchOneMultiStart(RegexNode branch)
+                {
+                    if (branch.Type == Concatenate)
+                    {
+                        branch = branch.Child(0);
+                    }
  
-            return StripEnation(Nothing);
+                    return branch.Type == One || branch.Type == Multi ? branch : null;
+                }
+            }
          }
  
          /// <summary>
@@ -1238,20 +1449,21 @@ namespace System.Text.RegularExpressions
  
          public void AddChild(RegexNode newChild)
          {
-            RegexNode reducedChild = newChild.Reduce();
-            reducedChild.Next = this;
+            newChild.Next = this; // so that the child can see its parent while being reduced
+            newChild = newChild.Reduce();
+            newChild.Next = this; // in case Reduce returns a different node that needs to be reparented
  
              if (Children is null)
              {
-                Children = reducedChild;
+                Children = newChild;
              }
              else if (Children is RegexNode currentChild)
              {
-                Children = new List<RegexNode>() { currentChild, reducedChild };
+                Children = new List<RegexNode>() { currentChild, newChild };
              }
              else
              {
-                ((List<RegexNode>)Children).Add(reducedChild);
+                ((List<RegexNode>)Children).Add(newChild);
              }
          }
  
@@ -1301,7 +1513,6 @@ namespace System.Text.RegularExpressions
          [ExcludeFromCodeCoverage]
          public string Description()
          {
-
              string typeStr = Type switch
              {
                  Oneloop => nameof(Oneloop),
@@ -1344,17 +1555,15 @@ namespace System.Text.RegularExpressions
                  _ => $"(unknown {Type})"
              };
  
-            var argSb = new StringBuilder().Append(typeStr);
-
-            if ((Options & RegexOptions.ExplicitCapture) != 0) argSb.Append("-C");
-            if ((Options & RegexOptions.IgnoreCase) != 0) argSb.Append("-I");
-            if ((Options & RegexOptions.RightToLeft) != 0) argSb.Append("-L");
-            if ((Options & RegexOptions.Multiline) != 0) argSb.Append("-M");
-            if ((Options & RegexOptions.Singleline) != 0) argSb.Append("-S");
-            if ((Options & RegexOptions.IgnorePatternWhitespace) != 0) argSb.Append("-X");
-            if ((Options & RegexOptions.ECMAScript) != 0) argSb.Append("-E");
+            var sb = new StringBuilder(typeStr);
  
-            argSb.Append(Indent());
+            if ((Options & RegexOptions.ExplicitCapture) != 0) sb.Append("-C");
+            if ((Options & RegexOptions.IgnoreCase) != 0) sb.Append("-I");
+            if ((Options & RegexOptions.RightToLeft) != 0) sb.Append("-L");
+            if ((Options & RegexOptions.Multiline) != 0) sb.Append("-M");
+            if ((Options & RegexOptions.Singleline) != 0) sb.Append("-S");
+            if ((Options & RegexOptions.IgnorePatternWhitespace) != 0) sb.Append("-X");
+            if ((Options & RegexOptions.ECMAScript) != 0) sb.Append("-E");
  
              switch (Type)
              {
@@ -1366,25 +1575,27 @@ namespace System.Text.RegularExpressions
                  case Notonelazy:
                  case One:
                  case Notone:
-                    argSb.Append(RegexCharClass.CharDescription(Ch));
+                    sb.Append(" '").Append(RegexCharClass.CharDescription(Ch)).Append('\'');
                      break;
                  case Capture:
-                    argSb.Append("index = " + M);
+                    sb.Append(' ').Append($"index = {M}");
                      if (N != -1)
-                        argSb.Append(", unindex = " + N);
+                    {
+                        sb.Append($", unindex = {N}");
+                    }
                      break;
                  case Ref:
                  case Testref:
-                    argSb.Append("index = " + M);
+                    sb.Append(' ').Append($"index = {M}");
                      break;
                  case Multi:
-                    argSb.Append(Str);
+                    sb.Append(" \"").Append(Str).Append('"');
                      break;
                  case Set:
                  case Setloop:
                  case Setloopatomic:
                  case Setlazy:
-                    argSb.Append(RegexCharClass.SetDescription(Str!));
+                    sb.Append(' ').Append(RegexCharClass.SetDescription(Str!));
                      break;
              }
  
@@ -1401,19 +1612,16 @@ namespace System.Text.RegularExpressions
                  case Setlazy:
                  case Loop:
                  case Lazyloop:
-                    if (argSb[^1] != ' ')
-                        argSb.Append(", ");
-                    argSb.Append("min = " + M + ", max = ");
-                    if (N == int.MaxValue)
-                        argSb.Append("inf");
-                    else
-                        argSb.Append(N);
+                    sb.Append(
+                        (M == 0 && N == int.MaxValue) ? "*" :
+                        (M == 0 && N == 1) ? "?" :
+                        (M == 1 && N == int.MaxValue) ? "+" :
+                        (N == int.MaxValue) ? $"{{{M}, *}}" :
+                        $"{{{M}, {N}}}");
                      break;
              }
  
-            string Indent() => new string(' ', Math.Max(1, 25 - argSb.Length));
-
-            return argSb.ToString();
+            return sb.ToString();
          }
  
          [ExcludeFromCodeCoverage]
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs

index acaf1e2..45af2b5 100644 (file)
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs
@@ -680,14 +680,18 @@ namespace System.Text.RegularExpressions
              sb.Append(a.Length);
  
              if (sb.Length < 8)
+            {
                  sb.Append(' ', 8 - sb.Length);
+            }
  
              sb.Append('(');
  
              for (int i = index; i < a.Length; i++)
              {
                  if (i > index)
+                {
                      sb.Append(' ');
+                }
                  sb.Append(a[i]);
              }
  
@@ -704,12 +708,18 @@ namespace System.Text.RegularExpressions
              sb.Append(runtextpos);
  
              if (sb.Length < 8)
+            {
                  sb.Append(' ', 8 - sb.Length);
+            }
  
              if (runtextpos > runtextbeg)
+            {
                  sb.Append(RegexCharClass.CharDescription(runtext![runtextpos - 1]));
+            }
              else
+            {
                  sb.Append('^');
+            }
  
              sb.Append('>');
  
diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.UnicodeChar.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.UnicodeChar.Tests.cs

index 133c95a..ebca100 100644 (file)
--- a/src/libraries/System.Text.RegularExpressions/tests/Regex.UnicodeChar.Tests.cs
+++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.UnicodeChar.Tests.cs
@@ -55,16 +55,20 @@ namespace System.Text.RegularExpressions
  
              for (int i = 0; i < 100; i++)
              {
-                StringBuilder builder1 = new StringBuilder();
-                StringBuilder builder2 = new StringBuilder();
+                var builder1 = new StringBuilder();
+                var builder2 = new StringBuilder();
+
                  for (int j = 0; j < validCharLength; j++)
                  {
                      char c = validChars[random.Next(validChars.Count)];
                      builder1.Append(c);
                      builder2.Append(c);
                  }
+
                  for (int j = 0; j < invalidCharLength; j++)
+                {
                      builder1.Append(invalidChars[random.Next(invalidChars.Count)]);
+                }
  
                  string input = builder1.ToString();
                  Match match = regex.Match(input);
@@ -94,18 +98,26 @@ namespace System.Text.RegularExpressions
  
              for (int i = 0; i < 500; i++)
              {
-                StringBuilder builder1 = new StringBuilder();
-                StringBuilder builder2 = new StringBuilder();
+                var builder1 = new StringBuilder();
+                var builder2 = new StringBuilder();
+
                  for (int j = 0; j < invalidCharLength; j++)
+                {
                      builder1.Append(invalidChars[random.Next(invalidChars.Count)]);
+                }
+
                  for (int j = 0; j < validCharLength; j++)
                  {
                      char c = validChars[random.Next(validChars.Count)];
                      builder1.Append(c);
                      builder2.Append(c);
                  }
+
                  for (int j = 0; j < invalidCharLength; j++)
+                {
                      builder1.Append(invalidChars[random.Next(invalidChars.Count)]);
+                }
+
                  string input = builder1.ToString();
  
                  Match match = regex.Match(input);
@@ -143,16 +155,21 @@ namespace System.Text.RegularExpressions
  
              for (int i = 0; i < 100; i++)
              {
-                StringBuilder builder1 = new StringBuilder();
-                StringBuilder builder2 = new StringBuilder();
+                var builder1 = new StringBuilder();
+                var builder2 = new StringBuilder();
+
                  for (int j = 0; j < validCharLength; j++)
                  {
                      char c = validChars[random.Next(validChars.Count)];
                      builder1.Append(c);
                      builder2.Append(c);
                  }
+
                  for (int j = 0; j < invalidCharLength; j++)
+                {
                      builder1.Append(invalidChars[random.Next(invalidChars.Count)]);
+                }
+
                  string input = builder1.ToString();
                  Match match = regex.Match(input);
  
@@ -173,18 +190,26 @@ namespace System.Text.RegularExpressions
  
              for (int i = 0; i < 100; i++)
              {
-                StringBuilder builder1 = new StringBuilder();
-                StringBuilder builder2 = new StringBuilder();
+                var builder1 = new StringBuilder();
+                var builder2 = new StringBuilder();
+
                  for (int j = 0; j < invalidCharLength; j++)
+                {
                      builder1.Append(invalidChars[random.Next(invalidChars.Count)]);
+                }
+
                  for (int j = 0; j < validCharLength; j++)
                  {
                      char c = validChars[random.Next(validChars.Count)];
                      builder1.Append(c);
                      builder2.Append(c);
                  }
+
                  for (int j = 0; j < invalidCharLength; j++)
+                {
                      builder1.Append(invalidChars[random.Next(invalidChars.Count)]);
+                }
+
                  string input = builder1.ToString();
  
                  Match match = regex.Match(input);
diff --git a/src/libraries/System.Text.RegularExpressions/tests/RegexReductionTests.cs b/src/libraries/System.Text.RegularExpressions/tests/RegexReductionTests.cs

index 7eb9e1d..b622ec6 100644 (file)
--- a/src/libraries/System.Text.RegularExpressions/tests/RegexReductionTests.cs
+++ b/src/libraries/System.Text.RegularExpressions/tests/RegexReductionTests.cs
@@ -277,8 +277,31 @@ namespace System.Text.RegularExpressions.Tests
          [InlineData("(?:a+){4}", "a{4,}")]
          [InlineData("(?:a{1,2}){4}", "a{4,8}")]
          // Alternation reduction
+        [InlineData("a|b", "[ab]")]
          [InlineData("a|b|c|d|e|g|h|z", "[a-eghz]")]
          [InlineData("a|b|c|def|g|h", "[a-c]|def|[gh]")]
+        [InlineData("this|that|there|then|those", "th(?:is|at|ere|en|ose)")]
+        [InlineData("it's (?>this|that|there|then|those)", "it's (?>th(?:is|at|ere|en|ose))")]
+        [InlineData("it's (?>this|that|there|then|those)!", "it's (?>th(?:is|at|ere|en|ose))!")]
+        [InlineData("abcd|abce", "abc[de]")]
+        [InlineData("abcd|abef", "ab(?:cd|ef)")]
+        [InlineData("abcd|aefg", "a(?:bcd|efg)")]
+        [InlineData("abcd|abc|ab|a", "a(?:bcd|bc|b|)")]
+        [InlineData("abcde|abcdef", "abcde(?:|f)")]
+        [InlineData("abcdef|abcde", "abcde(?:f|)")]
+        [InlineData("abcdef|abcdeg|abcdeh|abcdei|abcdej|abcdek|abcdel", "abcde[f-l]")]
+        [InlineData("(ab|ab*)bc", "(a(?:b|b*))bc")]
+        [InlineData("abc(?:defgh|defij)klmn", "abcdef(?:gh|ij)klmn")]
+        [InlineData("abc(defgh|defij)klmn", "abc(def(?:gh|ij))klmn")]
+        [InlineData("a[b-f]|a[g-k]", "a[b-k]")]
+        [InlineData("this|this", "this")]
+        [InlineData("this|this|this", "this")]
+        [InlineData("hello there|hello again|hello|hello|hello|hello", "hello(?: there| again|)")]
+        [InlineData("hello there|hello again|hello|hello|hello|hello|hello world", "hello(?: there| again|| world)")]
+        [InlineData("hello there|hello again|hello|hello|hello|hello|hello world|hello", "hello(?: there| again|| world)")]
+        [InlineData("abcd(?:(?i:e)|(?i:f))", "abcd(?i:[ef])")]
+        [InlineData("(?i:abcde)|(?i:abcdf)", "(?i:abcd[ef])")]
+        [InlineData("xyz(?:(?i:abcde)|(?i:abcdf))", "xyz(?i:abcd[ef])")]
          // Auto-atomicity
          [InlineData("a*b", "(?>a*)b")]
          [InlineData("a*b+", "(?>a*)b+")]
@@ -294,11 +317,16 @@ namespace System.Text.RegularExpressions.Tests
          [InlineData("(?:abc*|def*)g", "(?:ab(?>c*)|de(?>f*))g")]
          [InlineData("(?:a[ce]*|b*)g", "(?:a(?>[ce]*)|(?>b*))g")]
          [InlineData("(?:a[ce]*|b*)c", "(?:a[ce]*|(?>b*))c")]
+        [InlineData("apple|(?:orange|pear)|grape", "apple|orange|pear|grape")]
+        [InlineData("(?>(?>(?>(?:abc)*)))", "(?:abc)*")]
          public void PatternsReduceIdentically(string pattern1, string pattern2)
          {
              AssertExtensions.Equal(GetRegexCodes(new Regex(pattern1)), GetRegexCodes(new Regex(pattern2)));
-            Assert.NotEqual<int>(GetRegexCodes(new Regex(pattern1, RegexOptions.IgnoreCase)), GetRegexCodes(new Regex(pattern2)));
              Assert.NotEqual<int>(GetRegexCodes(new Regex(pattern1, RegexOptions.RightToLeft)), GetRegexCodes(new Regex(pattern2)));
+            if (!pattern1.Contains("?i:") && !pattern2.Contains("?i:"))
+            {
+                Assert.NotEqual<int>(GetRegexCodes(new Regex(pattern1, RegexOptions.IgnoreCase)), GetRegexCodes(new Regex(pattern2)));
+            }
          }
  
          [Theory]
@@ -341,6 +369,10 @@ namespace System.Text.RegularExpressions.Tests
          [InlineData("[ace][ace]{0,2147483646}", "[ace]{0,2147483647}")]
          [InlineData("[ace]{2147482647}[ace]{1000}", "[ace]{2147483647}")]
          [InlineData("[ace]{0,2147482647}[ace]{0,1000}", "[ace]{0,2147483647}")]
+        // Not reducing branches of alternations with different casing
+        [InlineData("(?i:abcd)|abcd", "abcd|abcd")]
+        [InlineData("abcd|(?i:abcd)", "abcd|abcd")]
+        [InlineData("abc(?:(?i:e)|f)", "abc[ef]")]
          // Not applying auto-atomicity
          [InlineData("a*b*", "(?>a*)b*")]
          [InlineData("[^\n]*\n*", "(?>[^\n]*)\n")]
author	Stephen Toub <stoub@microsoft.com>
	Mon, 27 Jan 2020 19:56:04 +0000 (14:56 -0500)
committer	GitHub <noreply@github.com>
	Mon, 27 Jan 2020 19:56:04 +0000 (14:56 -0500)
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs		patch \| blob \| history
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexBoyerMoore.cs		patch \| blob \| history
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs		patch \| blob \| history
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCode.cs		patch \| blob \| history
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs		patch \| blob \| history
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFCD.cs		patch \| blob \| history
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs		patch \| blob \| history
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs		patch \| blob \| history
src/libraries/System.Text.RegularExpressions/tests/Regex.UnicodeChar.Tests.cs		patch \| blob \| history
src/libraries/System.Text.RegularExpressions/tests/RegexReductionTests.cs		patch \| blob \| history