From d1d7ee27f1962b4c6830f056298cd08cf76563b9 Mon Sep 17 00:00:00 2001 From: Viktor Hofer Date: Thu, 15 Feb 2018 01:02:08 +0100 Subject: [PATCH] reduce regex op code time (dotnet/corefx#26877) * reduce regex op code time * Use ValueListBuilder and stackalloc during op code generation * Increasing op array size Commit migrated from https://github.com/dotnet/corefx/commit/1eb5ddf1d0a3dbc0a221769a45d81d2a2b7fa81d --- .../src/System.Text.RegularExpressions.csproj | 4 +- .../System/Text/RegularExpressions/RegexWriter.cs | 339 ++++++++------------- .../ResizableValueListBuilder.cs | 76 +++++ 3 files changed, 211 insertions(+), 208 deletions(-) create mode 100644 src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/ResizableValueListBuilder.cs diff --git a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj index 8b3d02d..49efc8a 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj +++ b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj @@ -33,6 +33,7 @@ + @@ -58,6 +59,7 @@ + @@ -66,4 +68,4 @@ - \ No newline at end of file + diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexWriter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexWriter.cs index 734f42f..d75c851 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexWriter.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexWriter.cs @@ -18,21 +18,30 @@ using System.Globalization; namespace System.Text.RegularExpressions { - internal sealed class RegexWriter + internal ref struct RegexWriter { - private int[] _intStack; - private int _depth; - private int[] _emitted; - private int _curpos; - private readonly Dictionary _stringhash; - private readonly List _stringtable; - private bool _counting; - private int _count; - private int _trackcount; - private Hashtable _caps; - private const int BeforeChild = 64; private const int AfterChild = 128; + // Distribution of common patterns indicates an average amount of 56 op codes. + private const int EmittedSize = 56; + private const int IntStackSize = 32; + + private ResizableValueListBuilder _emitted; + private ResizableValueListBuilder _intStack; + private readonly Dictionary _stringHash; + private readonly List _stringTable; + private Hashtable _caps; + private int _trackCount; + + private RegexWriter(Span emittedSpan, Span intStackSpan) + { + _emitted = new ResizableValueListBuilder(emittedSpan); + _intStack = new ResizableValueListBuilder(intStackSpan); + _stringHash = new Dictionary(); + _stringTable = new List(); + _caps = null; + _trackCount = 0; + } /// /// This is the only function that should be called from outside. @@ -52,55 +61,86 @@ namespace System.Text.RegularExpressions return retval; } - // Private constructor; can't be created outside - private RegexWriter() - { - _intStack = new int[32]; - _emitted = new int[32]; - _stringhash = new Dictionary(); - _stringtable = new List(); - } - /// - /// To avoid recursion, we use a simple integer stack. - /// This is the push. + /// The top level RegexCode generator. It does a depth-first walk + /// through the tree and calls EmitFragment to emits code before + /// and after each child of an interior node, and at each leaf. /// - private void PushInt(int i) + public RegexCode RegexCodeFromRegexTree(RegexTree tree) { - if (_depth >= _intStack.Length) + Span emittedSpan = stackalloc int[EmittedSize]; + Span intStackSpan = stackalloc int[IntStackSize]; + RegexWriter writer = new RegexWriter(emittedSpan, intStackSpan); + + // construct sparse capnum mapping if some numbers are unused + int capsize; + if (tree._capnumlist == null || tree._captop == tree._capnumlist.Length) + { + capsize = tree._captop; + writer._caps = null; + } + else { - int[] expanded = new int[_depth * 2]; + capsize = tree._capnumlist.Length; + writer._caps = tree._caps; + for (int i = 0; i < tree._capnumlist.Length; i++) + writer._caps[tree._capnumlist[i]] = i; + } + + RegexNode curNode = tree._root; + int curChild = 0; + + writer.Emit(RegexCode.Lazybranch, 0); + + for (; ; ) + { + if (curNode._children == null) + { + writer.EmitFragment(curNode._type, curNode, 0); + } + else if (curChild < curNode._children.Count) + { + writer.EmitFragment(curNode._type | BeforeChild, curNode, curChild); - Array.Copy(_intStack, 0, expanded, 0, _depth); + curNode = curNode._children[curChild]; + writer._intStack.Append(curChild); + curChild = 0; + continue; + } + + if (writer._intStack.Length == 0) + break; + + curChild = writer._intStack.Pop(); + curNode = curNode._next; - _intStack = expanded; + writer.EmitFragment(curNode._type | AfterChild, curNode, curChild); + curChild++; } - _intStack[_depth++] = i; - } + writer.PatchJump(0, writer._emitted.Length); + writer.Emit(RegexCode.Stop); - /// - /// true if the stack is empty. - /// - private bool EmptyStack() - { - return _depth == 0; - } + RegexPrefix fcPrefix = RegexFCD.FirstChars(tree); + RegexPrefix prefix = RegexFCD.Prefix(tree); + bool rtl = ((tree._options & RegexOptions.RightToLeft) != 0); - /// - /// This is the pop. - /// - private int PopInt() - { - return _intStack[--_depth]; - } + CultureInfo culture = (tree._options & RegexOptions.CultureInvariant) != 0 ? CultureInfo.InvariantCulture : CultureInfo.CurrentCulture; + RegexBoyerMoore bmPrefix; - /// - /// Returns the current position in the emitted code. - /// - private int CurPos() - { - return _curpos; + if (prefix != null && prefix.Prefix.Length > 0) + bmPrefix = new RegexBoyerMoore(prefix.Prefix, prefix.CaseInsensitive, rtl, culture); + else + bmPrefix = null; + + int anchors = RegexFCD.Anchors(tree); + int[] emitted = writer._emitted.AsReadOnlySpan().ToArray(); + + // Cleaning up and returning the borrowed arrays + writer._emitted.Dispose(); + writer._intStack.Dispose(); + + return new RegexCode(emitted, writer._stringTable, writer._trackCount, writer._caps, capsize, bmPrefix, fcPrefix, anchors, rtl); } /// @@ -119,14 +159,10 @@ namespace System.Text.RegularExpressions /// private void Emit(int op) { - if (_counting) - { - _count += 1; - if (RegexCode.OpcodeBacktracks(op)) - _trackcount += 1; - return; - } - _emitted[_curpos++] = op; + if (RegexCode.OpcodeBacktracks(op)) + _trackCount++; + + _emitted.Append(op); } /// @@ -134,15 +170,11 @@ namespace System.Text.RegularExpressions /// private void Emit(int op, int opd1) { - if (_counting) - { - _count += 2; - if (RegexCode.OpcodeBacktracks(op)) - _trackcount += 1; - return; - } - _emitted[_curpos++] = op; - _emitted[_curpos++] = opd1; + if (RegexCode.OpcodeBacktracks(op)) + _trackCount++; + + _emitted.Append(op); + _emitted.Append(opd1); } /// @@ -150,16 +182,12 @@ namespace System.Text.RegularExpressions /// private void Emit(int op, int opd1, int opd2) { - if (_counting) - { - _count += 3; - if (RegexCode.OpcodeBacktracks(op)) - _trackcount += 1; - return; - } - _emitted[_curpos++] = op; - _emitted[_curpos++] = opd1; - _emitted[_curpos++] = opd2; + if (RegexCode.OpcodeBacktracks(op)) + _trackCount++; + + _emitted.Append(op); + _emitted.Append(opd1); + _emitted.Append(opd2); } /// @@ -168,18 +196,15 @@ namespace System.Text.RegularExpressions /// private int StringCode(string str) { - if (_counting) - return 0; - if (str == null) str = string.Empty; int i; - if (!_stringhash.TryGetValue(str, out i)) + if (!_stringHash.TryGetValue(str, out i)) { - i = _stringtable.Count; - _stringhash[str] = i; - _stringtable.Add(str); + i = _stringTable.Count; + _stringHash[str] = i; + _stringTable.Add(str); } return i; @@ -203,106 +228,6 @@ namespace System.Text.RegularExpressions } /// - /// The top level RegexCode generator. It does a depth-first walk - /// through the tree and calls EmitFragment to emits code before - /// and after each child of an interior node, and at each leaf. - /// - /// It runs two passes, first to count the size of the generated - /// code, and second to generate the code. - /// - /// We should time it against the alternative, which is - /// to just generate the code and grow the array as we go. - /// - private RegexCode RegexCodeFromRegexTree(RegexTree tree) - { - RegexNode curNode; - int curChild; - int capsize; - RegexPrefix fcPrefix; - RegexPrefix prefix; - int anchors; - RegexBoyerMoore bmPrefix; - bool rtl; - - // construct sparse capnum mapping if some numbers are unused - - if (tree._capnumlist == null || tree._captop == tree._capnumlist.Length) - { - capsize = tree._captop; - _caps = null; - } - else - { - capsize = tree._capnumlist.Length; - _caps = tree._caps; - for (int i = 0; i < tree._capnumlist.Length; i++) - _caps[tree._capnumlist[i]] = i; - } - - _counting = true; - - for (; ;) - { - if (!_counting) - _emitted = new int[_count]; - - curNode = tree._root; - curChild = 0; - - Emit(RegexCode.Lazybranch, 0); - - for (; ;) - { - if (curNode._children == null) - { - EmitFragment(curNode._type, curNode, 0); - } - else if (curChild < curNode._children.Count) - { - EmitFragment(curNode._type | BeforeChild, curNode, curChild); - - curNode = curNode._children[curChild]; - PushInt(curChild); - curChild = 0; - continue; - } - - if (EmptyStack()) - break; - - curChild = PopInt(); - curNode = curNode._next; - - EmitFragment(curNode._type | AfterChild, curNode, curChild); - curChild++; - } - - PatchJump(0, CurPos()); - Emit(RegexCode.Stop); - - if (!_counting) - break; - - _counting = false; - } - - fcPrefix = RegexFCD.FirstChars(tree); - - prefix = RegexFCD.Prefix(tree); - rtl = ((tree._options & RegexOptions.RightToLeft) != 0); - - CultureInfo culture = (tree._options & RegexOptions.CultureInvariant) != 0 ? CultureInfo.InvariantCulture : CultureInfo.CurrentCulture; - if (prefix != null && prefix.Prefix.Length > 0) - bmPrefix = new RegexBoyerMoore(prefix.Prefix, prefix.CaseInsensitive, rtl, culture); - else - bmPrefix = null; - - anchors = RegexFCD.Anchors(tree); - - return new RegexCode(_emitted, _stringtable, _trackcount, _caps, capsize, bmPrefix, fcPrefix, anchors, rtl); - } - - /// /// The main RegexCode generator. It does a depth-first walk /// through the tree and calls EmitFragment to emits code before /// and after each child of an interior node, and at each leaf. @@ -329,7 +254,7 @@ namespace System.Text.RegularExpressions case RegexNode.Alternate | BeforeChild: if (curIndex < node._children.Count - 1) { - PushInt(CurPos()); + _intStack.Append(_emitted.Length); Emit(RegexCode.Lazybranch, 0); } break; @@ -338,17 +263,17 @@ namespace System.Text.RegularExpressions { if (curIndex < node._children.Count - 1) { - int LBPos = PopInt(); - PushInt(CurPos()); + int LBPos = _intStack.Pop(); + _intStack.Append(_emitted.Length); Emit(RegexCode.Goto, 0); - PatchJump(LBPos, CurPos()); + PatchJump(LBPos, _emitted.Length); } else { int I; for (I = 0; I < curIndex; I++) { - PatchJump(PopInt(), CurPos()); + PatchJump(_intStack.Pop(), _emitted.Length); } } break; @@ -359,7 +284,7 @@ namespace System.Text.RegularExpressions { case 0: Emit(RegexCode.Setjump); - PushInt(CurPos()); + _intStack.Append(_emitted.Length); Emit(RegexCode.Lazybranch, 0); Emit(RegexCode.Testref, MapCapnum(node._m)); Emit(RegexCode.Forejump); @@ -372,10 +297,10 @@ namespace System.Text.RegularExpressions { case 0: { - int Branchpos = PopInt(); - PushInt(CurPos()); + int Branchpos = _intStack.Pop(); + _intStack.Append(_emitted.Length); Emit(RegexCode.Goto, 0); - PatchJump(Branchpos, CurPos()); + PatchJump(Branchpos, _emitted.Length); Emit(RegexCode.Forejump); if (node._children.Count > 1) break; @@ -383,7 +308,7 @@ namespace System.Text.RegularExpressions goto case 1; } case 1: - PatchJump(PopInt(), CurPos()); + PatchJump(_intStack.Pop(), _emitted.Length); break; } break; @@ -394,7 +319,7 @@ namespace System.Text.RegularExpressions case 0: Emit(RegexCode.Setjump); Emit(RegexCode.Setmark); - PushInt(CurPos()); + _intStack.Append(_emitted.Length); Emit(RegexCode.Lazybranch, 0); break; } @@ -408,10 +333,10 @@ namespace System.Text.RegularExpressions Emit(RegexCode.Forejump); break; case 1: - int Branchpos = PopInt(); - PushInt(CurPos()); + int Branchpos = _intStack.Pop(); + _intStack.Append(_emitted.Length); Emit(RegexCode.Goto, 0); - PatchJump(Branchpos, CurPos()); + PatchJump(Branchpos, _emitted.Length); Emit(RegexCode.Getmark); Emit(RegexCode.Forejump); @@ -420,7 +345,7 @@ namespace System.Text.RegularExpressions // else fallthrough goto case 2; case 2: - PatchJump(PopInt(), CurPos()); + PatchJump(_intStack.Pop(), _emitted.Length); break; } break; @@ -435,25 +360,25 @@ namespace System.Text.RegularExpressions if (node._m == 0) { - PushInt(CurPos()); + _intStack.Append(_emitted.Length); Emit(RegexCode.Goto, 0); } - PushInt(CurPos()); + _intStack.Append(_emitted.Length); break; case RegexNode.Loop | AfterChild: case RegexNode.Lazyloop | AfterChild: { - int StartJumpPos = CurPos(); + int StartJumpPos = _emitted.Length; int Lazy = (nodetype - (RegexNode.Loop | AfterChild)); if (node._n < int.MaxValue || node._m > 1) - Emit(RegexCode.Branchcount + Lazy, PopInt(), node._n == int.MaxValue ? int.MaxValue : node._n - node._m); + Emit(RegexCode.Branchcount + Lazy, _intStack.Pop(), node._n == int.MaxValue ? int.MaxValue : node._n - node._m); else - Emit(RegexCode.Branchmark + Lazy, PopInt()); + Emit(RegexCode.Branchmark + Lazy, _intStack.Pop()); if (node._m == 0) - PatchJump(PopInt(), StartJumpPos); + PatchJump(_intStack.Pop(), StartJumpPos); } break; @@ -489,13 +414,13 @@ namespace System.Text.RegularExpressions case RegexNode.Prevent | BeforeChild: Emit(RegexCode.Setjump); - PushInt(CurPos()); + _intStack.Append(_emitted.Length); Emit(RegexCode.Lazybranch, 0); break; case RegexNode.Prevent | AfterChild: Emit(RegexCode.Backjump); - PatchJump(PopInt(), CurPos()); + PatchJump(_intStack.Pop(), _emitted.Length); Emit(RegexCode.Forejump); break; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/ResizableValueListBuilder.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/ResizableValueListBuilder.cs new file mode 100644 index 0000000..8bebf5e --- /dev/null +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/ResizableValueListBuilder.cs @@ -0,0 +1,76 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.Buffers; +using System.Diagnostics; +using System.Runtime.CompilerServices; + +namespace System.Collections.Generic +{ + internal ref struct ResizableValueListBuilder + { + private Span _span; + private T[] _arrayFromPool; + private int _pos; + + public ResizableValueListBuilder(Span initialSpan) + { + _span = initialSpan; + _arrayFromPool = null; + _pos = 0; + } + + public ref T this[int index] => ref _span[index]; + + public int Length => _pos; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void Append(T item) + { + int pos = _pos; + if (pos >= _span.Length) + Grow(); + + _span[pos] = item; + _pos = pos + 1; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public T Pop() + { + _pos--; + return _span[_pos]; + } + + public ReadOnlySpan AsReadOnlySpan() + { + return _span.Slice(0, _pos); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void Dispose() + { + if (_arrayFromPool != null) + { + ArrayPool.Shared.Return(_arrayFromPool); + _arrayFromPool = null; + } + } + + private void Grow() + { + T[] array = ArrayPool.Shared.Rent(_span.Length * 2); + + bool success = _span.TryCopyTo(array); + Debug.Assert(success); + + T[] toReturn = _arrayFromPool; + _span = _arrayFromPool = array; + if (toReturn != null) + { + ArrayPool.Shared.Return(toReturn); + } + } + } +} -- 2.7.4