Improve regex compiler / source generator for sets (#84370)

author Stephen Toub <stoub@microsoft.com>

Mon, 10 Apr 2023 16:31:44 +0000 (12:31 -0400)

committer GitHub <noreply@github.com>

Mon, 10 Apr 2023 16:31:44 +0000 (12:31 -0400)
author Stephen Toub <stoub@microsoft.com>
Mon, 10 Apr 2023 16:31:44 +0000 (12:31 -0400)
committer GitHub <noreply@github.com>
Mon, 10 Apr 2023 16:31:44 +0000 (12:31 -0400)
diff --git a/src/libraries/System.Data.Odbc/src/System.Data.Odbc.csproj b/src/libraries/System.Data.Odbc/src/System.Data.Odbc.csproj

index 5efd555..da7a4b5 100644 (file)
--- a/src/libraries/System.Data.Odbc/src/System.Data.Odbc.csproj
+++ b/src/libraries/System.Data.Odbc/src/System.Data.Odbc.csproj
@@ -134,6 +134,9 @@ System.Data.Odbc.OdbcTransaction</PackageDescription>
               Link="Common\DisableRuntimeMarshalling.cs" />
      <Compile Include="$(CommonPath)System\Runtime\InteropServices\HandleRefMarshaller.cs"
               Link="Common\System\Runtime\InteropServices\HandleRefMarshaller.cs" />
+  </ItemGroup>
+
+  <ItemGroup Condition="$([MSBuild]::IsTargetFrameworkCompatible('$(TargetFramework)', 'net8.0'))">
      <ProjectReference Include="..\..\System.Text.RegularExpressions\gen\System.Text.RegularExpressions.Generator.csproj"
                    SetTargetFramework="TargetFramework=netstandard2.0"
                    OutputItemType="Analyzer"
diff --git a/src/libraries/System.Data.OleDb/src/System.Data.OleDb.csproj b/src/libraries/System.Data.OleDb/src/System.Data.OleDb.csproj

index 0d53539..94b0c8c 100644 (file)
--- a/src/libraries/System.Data.OleDb/src/System.Data.OleDb.csproj
+++ b/src/libraries/System.Data.OleDb/src/System.Data.OleDb.csproj
@@ -151,7 +151,7 @@ System.Data.OleDb.OleDbTransaction</PackageDescription>
      <ProjectReference Include="$(LibrariesProjectRoot)System.Diagnostics.PerformanceCounter\src\System.Diagnostics.PerformanceCounter.csproj" />
    </ItemGroup>
  
-  <ItemGroup Condition="$([MSBuild]::IsTargetFrameworkCompatible('$(TargetFramework)', 'net7.0'))">
+  <ItemGroup Condition="$([MSBuild]::IsTargetFrameworkCompatible('$(TargetFramework)', 'net8.0'))">
      <ProjectReference Include="..\..\System.Text.RegularExpressions\gen\System.Text.RegularExpressions.Generator.csproj"
                    SetTargetFramework="TargetFramework=netstandard2.0"
                    OutputItemType="Analyzer"
diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs

index f0c15fa..7f3c41f 100644 (file)
--- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs
+++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs
@@ -458,7 +458,7 @@ namespace System.Text.RegularExpressions.Generator
              // characters _not_ contained in the set, and then do a search for the inverse of that, which will be
              // all of the target ASCII characters and all of non-ASCII.
              var asciiChars = new List<char>();
-            for (int i = 0; i <= 0x7f; i++)
+            for (int i = 0; i < 128; i++)
              {
                  if (!RegexCharClass.CharInClass((char)i, set))
                  {
@@ -466,6 +466,7 @@ namespace System.Text.RegularExpressions.Generator
                  }
              }
  
+            // If this is a known set, use a predetermined simple name for the helper.
              string? helperName = set switch
              {
                  RegexCharClass.DigitClass => "IndexOfAnyDigit",
@@ -496,6 +497,18 @@ namespace System.Text.RegularExpressions.Generator
  
                  _ => null,
              };
+
+            // If this set is just from a few Unicode categories, derive a name from the categories.
+            if (helperName is null)
+            {
+                Span<UnicodeCategory> categories = stackalloc UnicodeCategory[5]; // arbitrary limit to keep names from being too unwieldy
+                if (RegexCharClass.TryGetOnlyCategories(set, categories, out int numCategories, out bool negatedCategory))
+                {
+                    helperName = $"IndexOfAny{(negatedCategory ? "Except" : "")}{string.Concat(categories.Slice(0, numCategories).ToArray().Select(c => c.ToString()))}";
+                }
+            }
+
+            // As a final fallback, manufacture a name unique to the full set description.
              if (helperName is null)
              {
                  using (SHA256 sha = SHA256.Create())
@@ -522,7 +535,7 @@ namespace System.Text.RegularExpressions.Generator
                            $"    int i = span.IndexOfAnyExcept({EmitIndexOfAnyValues(asciiChars.ToArray(), requiredHelpers)});");
                  lines.Add($"    if ((uint)i < (uint)span.Length)");
                  lines.Add($"    {{");
-                lines.Add($"        if (span[i] <= 0x7f)");
+                lines.Add($"        if (char.IsAscii(span[i]))");
                  lines.Add($"        {{");
                  lines.Add($"            return i;");
                  lines.Add($"        }}");
@@ -1004,9 +1017,13 @@ namespace System.Text.RegularExpressions.Generator
  
                  // Use IndexOf{Any} to accelerate the skip loop via vectorization to match the first prefix.
                  // But we avoid using it for the relatively common case of the starting set being '.', aka anything other than
-                // a newline, as it's very rare to have long, uninterrupted sequences of newlines.
+                // a newline, as it's very rare to have long, uninterrupted sequences of newlines. And we avoid using it
+                // for the case of the starting set being anything (e.g. '.' with SingleLine), as in that case it'll always match
+                // the first char.
                  int setIndex = 0;
-                bool canUseIndexOf = primarySet.Set != RegexCharClass.NotNewLineClass;
+                bool canUseIndexOf =
+                    primarySet.Set != RegexCharClass.NotNewLineClass &&
+                    primarySet.Set != RegexCharClass.AnyClass;
                  bool needLoop = !canUseIndexOf || setsToUse > 1;
  
                  FinishEmitBlock loopBlock = default;
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs

index ab0dd55..71a3004 100644 (file)
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
@@ -1327,23 +1327,21 @@ namespace System.Text.RegularExpressions
                  return false;
              }
  
-            return CharInCategory(ch, set, start, setLength, categoryLength);
+            return CharInCategory(ch, set.AsSpan(SetStartIndex + start + setLength, categoryLength));
          }
  
-        private static bool CharInCategory(char ch, string set, int start, int setLength, int categoryLength)
+        private static bool CharInCategory(char ch, ReadOnlySpan<char> categorySetSegment)
          {
              UnicodeCategory chcategory = char.GetUnicodeCategory(ch);
  
-            int i = start + SetStartIndex + setLength;
-            int end = i + categoryLength;
-            while (i < end)
+            for (int i = 0; i < categorySetSegment.Length; i++)
              {
-                int curcat = (short)set[i];
+                int curcat = (short)categorySetSegment[i];
  
                  if (curcat == 0)
                  {
                      // zero is our marker for a group of categories - treated as a unit
-                    if (CharInCategoryGroup(chcategory, set, ref i))
+                    if (CharInCategoryGroup(chcategory, categorySetSegment, ref i))
                      {
                          return true;
                      }
@@ -1379,8 +1377,6 @@ namespace System.Text.RegularExpressions
                          return true;
                      }
                  }
-
-                i++;
              }
  
              return false;
@@ -1390,7 +1386,7 @@ namespace System.Text.RegularExpressions
          /// This is used for categories which are composed of other categories - L, N, Z, W...
          /// These groups need special treatment when they are negated
          /// </summary>
-        private static bool CharInCategoryGroup(UnicodeCategory chcategory, string category, ref int i)
+        private static bool CharInCategoryGroup(UnicodeCategory chcategory, ReadOnlySpan<char> category, ref int i)
          {
              int pos = i + 1;
              int curcat = (short)category[pos];
@@ -1717,6 +1713,52 @@ namespace System.Text.RegularExpressions
                          }
                      }
                  }
+
+                // If the class now has a range that includes everything, and if it doesn't have subtraction,
+                // we can remove all of its categories, as they're duplicative (the set already includes everything).
+                if (!_negate &&
+                    _subtractor is null &&
+                    _categories?.Length > 0 &&
+                    rangelist.Count == 1 && rangelist[0].First == 0 && rangelist[0].Last == LastChar)
+                {
+                    _categories.Clear();
+                }
+
+                // If there's only a single character omitted from ranges, if there's no subtractor, and if there are categories,
+                // see if that character is in the categories.  If it is, then we can replace whole thing with a complete "any" range.
+                // If it's not, then we can remove the categories, as they're only duplicating the rest of the range, turning the set
+                // into a "not one". This primarily helps in the case of a synthesized set from analysis that ends up combining '.' with
+                // categories, as we want to reduce that set down to either [^\n] or [\0-\uFFFF]. (This can be extrapolated to any number
+                // of missing characters; in fact, categories in general are superfluous and the entire set can be represented as ranges.
+                // But categories serve as a space optimization, and we strike a balance between testing many characters and the time/complexity
+                // it takes to do so.  Thus, we limit this to the common case of a single missing character.)
+                if (!_negate &&
+                    _subtractor is null &&
+                    _categories?.Length > 0 &&
+                    rangelist.Count == 2 && rangelist[0].First == 0 && rangelist[0].Last + 2 == rangelist[1].First && rangelist[1].Last == LastChar)
+                {
+                    var vsb = new ValueStringBuilder(stackalloc char[256]);
+                    foreach (ReadOnlyMemory<char> chunk in _categories!.GetChunks())
+                    {
+                        vsb.Append(chunk.Span);
+                    }
+
+                    if (CharInCategory((char)(rangelist[0].Last + 1), vsb.AsSpan()))
+                    {
+                        rangelist.RemoveAt(1);
+                        rangelist[0] = ('\0', LastChar);
+                    }
+                    else
+                    {
+                        _negate = true;
+                        rangelist.RemoveAt(1);
+                        char notOne = (char)(rangelist[0].Last + 1);
+                        rangelist[0] = (notOne, notOne);
+                    }
+                    _categories.Clear();
+
+                    vsb.Dispose();
+                }
              }
          }
  
@@ -1792,12 +1834,20 @@ namespace System.Text.RegularExpressions
  
              void RenderRanges()
              {
-                for (; index < SetStartIndex + set[SetLengthIndex]; index += 2)
+                int rangesEnd = SetStartIndex + set[SetLengthIndex];
+                while (index < rangesEnd)
                  {
                      ch1 = set[index];
-                    ch2 = index + 1 < set.Length ?
-                        (char)(set[index + 1] - 1) :
-                        LastChar;
+                    if (index + 1 < rangesEnd)
+                    {
+                        ch2 = (char)(set[index + 1] - 1);
+                        index += 2;
+                    }
+                    else
+                    {
+                        ch2 = LastChar;
+                        index++;
+                    }
  
                      desc.Append(DescribeChar(ch1));
  
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs

index d44a7e6..f9be28a 100644 (file)
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
@@ -8,6 +8,7 @@ using System.Diagnostics.CodeAnalysis;
  using System.Globalization;
  using System.Reflection;
  using System.Reflection.Emit;
+using System.Runtime.CompilerServices;
  using System.Runtime.InteropServices;
  using System.Threading;
  
@@ -97,6 +98,7 @@ namespace System.Text.RegularExpressions
          private static readonly MethodInfo s_stringGetCharsMethod = typeof(string).GetMethod("get_Chars", new Type[] { typeof(int) })!;
          private static readonly MethodInfo s_arrayResize = typeof(Array).GetMethod("Resize")!.MakeGenericMethod(typeof(int));
          private static readonly MethodInfo s_mathMinIntInt = typeof(Math).GetMethod("Min", new Type[] { typeof(int), typeof(int) })!;
+        private static readonly MethodInfo s_memoryMarshalGetArrayDataReferenceIndexOfAnyValues = typeof(MemoryMarshal).GetMethod("GetArrayDataReference", new Type[] { Type.MakeGenericMethodParameter(0).MakeArrayType() })!.MakeGenericMethod(typeof(IndexOfAnyValues<char>))!;
          // Note:
          // Single-range helpers like IsAsciiLetterLower, IsAsciiLetterUpper, IsAsciiDigit, and IsBetween aren't used here, as the IL generated for those
          // single-range checks is as cheap as the method call, and there's no readability issue as with the source generator.
@@ -834,9 +836,13 @@ namespace System.Text.RegularExpressions
  
                  // Use IndexOf{Any} to accelerate the skip loop via vectorization to match the first prefix.
                  // But we avoid using it for the relatively common case of the starting set being '.', aka anything other than
-                // a newline, as it's very rare to have long, uninterrupted sequences of newlines.
+                // a newline, as it's very rare to have long, uninterrupted sequences of newlines. And we avoid using it
+                // for the case of the starting set being anything (e.g. '.' with SingleLine), as in that case it'll always match
+                // the first char.
                  int setIndex = 0;
-                bool canUseIndexOf = primarySet.Set != RegexCharClass.NotNewLineClass;
+                bool canUseIndexOf =
+                    primarySet.Set != RegexCharClass.NotNewLineClass &&
+                    primarySet.Set != RegexCharClass.AnyClass;
                  bool needLoop = !canUseIndexOf || setsToUse > 1;
  
                  Label checkSpanLengthLabel = default;
@@ -6098,10 +6104,19 @@ namespace System.Text.RegularExpressions
              int index = list.Count;
              list.Add(IndexOfAnyValues.Create(chars));
  
-            // this._indexOfAnyValues[index]
+            // Logically do _indexOfAnyValues[index], but avoid the bounds check on accessing the array,
+            // and cast to the known derived sealed type to enable devirtualization.
+
+            // DerivedIndexOfAnyValues d = Unsafe.Add(ref MemoryMarshal.GetArrayDataReference(this._indexOfAnyValues), index);
+            // ... = d;
              Ldthisfld(s_indexOfAnyValuesArrayField);
-            Ldc(index);
-            _ilg!.Emit(OpCodes.Ldelem_Ref);
+            Call(s_memoryMarshalGetArrayDataReferenceIndexOfAnyValues);
+            Ldc(index * IntPtr.Size);
+            Add();
+            _ilg!.Emit(OpCodes.Ldind_Ref);
+            LocalBuilder ioavLocal = _ilg!.DeclareLocal(list[index].GetType());
+            Stloc(ioavLocal);
+            Ldloc(ioavLocal);
          }
      }
  }
author	Stephen Toub <stoub@microsoft.com>
	Mon, 10 Apr 2023 16:31:44 +0000 (12:31 -0400)
committer	GitHub <noreply@github.com>
	Mon, 10 Apr 2023 16:31:44 +0000 (12:31 -0400)
src/libraries/System.Data.Odbc/src/System.Data.Odbc.csproj		patch \| blob \| history
src/libraries/System.Data.OleDb/src/System.Data.OleDb.csproj		patch \| blob \| history
src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs		patch \| blob \| history
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs		patch \| blob \| history
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs		patch \| blob \| history