Improve incremental handling in regex generator (#83868)
authorStephen Toub <stoub@microsoft.com>
Fri, 24 Mar 2023 21:05:41 +0000 (17:05 -0400)
committerGitHub <noreply@github.com>
Fri, 24 Mar 2023 21:05:41 +0000 (17:05 -0400)
* Improve incrementalism:
- Don't calculate the tree or analysis inside the semantic info.
- Dont' store methodDeclartionSyntax and store a comparable across runs location instead.

* Add comments, clean up, and simplify parser

---------

Co-authored-by: Chris Sienkiewicz <chsienki@microsoft.com>
src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Parser.cs
src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.cs

index e403461..d7e5d4a 100644 (file)
@@ -2,6 +2,7 @@
 // The .NET Foundation licenses this file to you under the MIT license.
 
 using System.Collections.Immutable;
+using System.Diagnostics;
 using System.Globalization;
 using System.Linq;
 using System.Threading;
@@ -19,7 +20,7 @@ namespace System.Text.RegularExpressions.Generator
         private const string GeneratedRegexAttributeName = "System.Text.RegularExpressions.GeneratedRegexAttribute";
 
         // Returns null if nothing to do, Diagnostic if there's an error to report, or RegexType if the type was analyzed successfully.
-        private static object? GetSemanticTargetForGeneration(
+        private static object? GetRegexMethodDataOrFailureDiagnostic(
             GeneratorAttributeSyntaxContext context, CancellationToken cancellationToken)
         {
             var methodSyntax = (MethodDeclarationSyntax)context.TargetNode;
@@ -27,9 +28,8 @@ namespace System.Text.RegularExpressions.Generator
 
             Compilation compilation = sm.Compilation;
             INamedTypeSymbol? regexSymbol = compilation.GetBestTypeByMetadataName(RegexName);
-            INamedTypeSymbol? generatedRegexAttributeSymbol = compilation.GetBestTypeByMetadataName(GeneratedRegexAttributeName);
 
-            if (regexSymbol is null || generatedRegexAttributeSymbol is null)
+            if (regexSymbol is null)
             {
                 // Required types aren't available
                 return null;
@@ -47,71 +47,51 @@ namespace System.Text.RegularExpressions.Generator
                 return null;
             }
 
-            ImmutableArray<AttributeData>? boundAttributes = regexMethodSymbol.GetAttributes();
-            if (boundAttributes is null || boundAttributes.Value.Length == 0)
+            ImmutableArray<AttributeData> boundAttributes = context.Attributes;
+            if (boundAttributes.Length != 1)
             {
-                return null;
+                return Diagnostic.Create(DiagnosticDescriptors.MultipleGeneratedRegexAttributes, methodSyntax.GetLocation());
             }
+            AttributeData generatedRegexAttr = boundAttributes[0];
 
-            bool attributeFound = false;
-            string? pattern = null;
+            if (generatedRegexAttr.ConstructorArguments.Any(ca => ca.Kind == TypedConstantKind.Error))
+            {
+                return Diagnostic.Create(DiagnosticDescriptors.InvalidGeneratedRegexAttribute, methodSyntax.GetLocation());
+            }
+
+            ImmutableArray<TypedConstant> items = generatedRegexAttr.ConstructorArguments;
+            if (items.Length is 0 or > 4)
+            {
+                return Diagnostic.Create(DiagnosticDescriptors.InvalidGeneratedRegexAttribute, methodSyntax.GetLocation());
+            }
+
+            string? pattern = items[0].Value as string;
             int? options = null;
             int? matchTimeout = null;
             string? cultureName = string.Empty;
-            foreach (AttributeData attributeData in boundAttributes)
+            if (items.Length >= 2)
             {
-                if (!SymbolEqualityComparer.Default.Equals(attributeData.AttributeClass, generatedRegexAttributeSymbol))
+                options = items[1].Value as int?;
+                if (items.Length == 4)
                 {
-                    continue;
+                    matchTimeout = items[2].Value as int?;
+                    cultureName = items[3].Value as string;
                 }
-
-                if (attributeData.ConstructorArguments.Any(ca => ca.Kind == TypedConstantKind.Error))
-                {
-                    return Diagnostic.Create(DiagnosticDescriptors.InvalidGeneratedRegexAttribute, methodSyntax.GetLocation());
-                }
-
-                if (pattern is not null)
-                {
-                    return Diagnostic.Create(DiagnosticDescriptors.MultipleGeneratedRegexAttributes, methodSyntax.GetLocation());
-                }
-
-                ImmutableArray<TypedConstant> items = attributeData.ConstructorArguments;
-                if (items.Length == 0 || items.Length > 4)
-                {
-                    return Diagnostic.Create(DiagnosticDescriptors.InvalidGeneratedRegexAttribute, methodSyntax.GetLocation());
-                }
-
-                attributeFound = true;
-                pattern = items[0].Value as string;
-                if (items.Length >= 2)
+                // If there are 3 parameters, we need to check if the third argument is
+                // int matchTimeoutMilliseconds, or string cultureName.
+                else if (items.Length == 3)
                 {
-                    options = items[1].Value as int?;
-                    if (items.Length == 4)
+                    if (items[2].Type?.SpecialType == SpecialType.System_Int32)
                     {
                         matchTimeout = items[2].Value as int?;
-                        cultureName = items[3].Value as string;
                     }
-                    // If there are 3 parameters, we need to check if the third argument is
-                    // int matchTimeoutMilliseconds, or string cultureName.
-                    else if (items.Length == 3)
+                    else
                     {
-                        if (items[2].Type?.SpecialType == SpecialType.System_Int32)
-                        {
-                            matchTimeout = items[2].Value as int?;
-                        }
-                        else
-                        {
-                            cultureName = items[2].Value as string;
-                        }
+                        cultureName = items[2].Value as string;
                     }
                 }
             }
 
-            if (!attributeFound)
-            {
-                return null;
-            }
-
             if (pattern is null || cultureName is null)
             {
                 return Diagnostic.Create(DiagnosticDescriptors.InvalidRegexArguments, methodSyntax.GetLocation(), "(null)");
@@ -128,7 +108,7 @@ namespace System.Text.RegularExpressions.Generator
 
             RegexOptions regexOptions = options is not null ? (RegexOptions)options : RegexOptions.None;
 
-            // If  RegexOptions.IgnoreCase was specified or the inline ignore case option `(?i)` is present in the pattern, then we will (in priority order):
+            // If RegexOptions.IgnoreCase was specified or the inline ignore case option `(?i)` is present in the pattern, then we will (in priority order):
             // - If a culture name was passed in:
             //   - If RegexOptions.CultureInvariant was also passed in, then we emit a diagnostic due to the explicit conflict.
             //   - We try to initialize a culture using the passed in culture name to be used for case-sensitive comparisons. If
@@ -186,19 +166,6 @@ namespace System.Text.RegularExpressions.Generator
                 return Diagnostic.Create(DiagnosticDescriptors.InvalidRegexArguments, methodSyntax.GetLocation(), "matchTimeout");
             }
 
-            // Parse the input pattern
-            RegexTree regexTree;
-            AnalysisResults analysis;
-            try
-            {
-                regexTree = RegexParser.Parse(pattern, regexOptions | RegexOptions.Compiled, culture); // make sure Compiled is included to get all optimizations applied to it
-                analysis = RegexTreeAnalyzer.Analyze(regexTree);
-            }
-            catch (Exception e)
-            {
-                return Diagnostic.Create(DiagnosticDescriptors.InvalidRegexArguments, methodSyntax.GetLocation(), e.Message);
-            }
-
             // Determine the namespace the class is declared in, if any
             string? ns = regexMethodSymbol.ContainingType?.ContainingNamespace?.ToDisplayString(
                 SymbolDisplayFormat.FullyQualifiedFormat.WithGlobalNamespaceStyle(SymbolDisplayGlobalNamespaceStyle.Omitted));
@@ -208,16 +175,15 @@ namespace System.Text.RegularExpressions.Generator
                 ns ?? string.Empty,
                 $"{typeDec.Identifier}{typeDec.TypeParameterList}");
 
-            var regexMethod = new RegexMethod(
+            var result = new RegexPatternAndSyntax(
                 regexType,
-                methodSyntax,
+                GetComparableLocation(methodSyntax),
                 regexMethodSymbol.Name,
                 methodSyntax.Modifiers.ToString(),
                 pattern,
                 regexOptions,
                 matchTimeout,
-                regexTree,
-                analysis);
+                culture);
 
             RegexType current = regexType;
             var parent = typeDec.Parent as TypeDeclarationSyntax;
@@ -233,7 +199,7 @@ namespace System.Text.RegularExpressions.Generator
                 parent = parent.Parent as TypeDeclarationSyntax;
             }
 
-            return regexMethod;
+            return result;
 
             static bool IsAllowedKind(SyntaxKind kind) =>
                 kind == SyntaxKind.ClassDeclaration ||
@@ -241,10 +207,22 @@ namespace System.Text.RegularExpressions.Generator
                 kind == SyntaxKind.RecordDeclaration ||
                 kind == SyntaxKind.RecordStructDeclaration ||
                 kind == SyntaxKind.InterfaceDeclaration;
+
+
+            // Get a Location object that doesn't store a reference to the compilation.
+            // That allows it to compare equally across compilations.
+            static Location GetComparableLocation(MethodDeclarationSyntax method)
+            {
+                var location = method.GetLocation();
+                return Location.Create(location.SourceTree?.FilePath ?? string.Empty, location.SourceSpan, location.GetLineSpan().Span);
+            }
         }
 
-        /// <summary>A regex method.</summary>
-        internal sealed record RegexMethod(RegexType DeclaringType, MethodDeclarationSyntax MethodSyntax, string MethodName, string Modifiers, string Pattern, RegexOptions Options, int? MatchTimeout, RegexTree Tree, AnalysisResults Analysis)
+        /// <summary>Data about a regex directly from the GeneratedRegex attribute.</summary>
+        internal sealed record RegexPatternAndSyntax(RegexType DeclaringType, Location DiagnosticLocation, string MethodName, string Modifiers, string Pattern, RegexOptions Options, int? MatchTimeout, CultureInfo Culture);
+
+        /// <summary>Data about a regex, including a fully parsed RegexTree and subsequent analysis.</summary>
+        internal sealed record RegexMethod(RegexType DeclaringType, Location DiagnosticLocation, string MethodName, string Modifiers, string Pattern, RegexOptions Options, int? MatchTimeout, RegexTree Tree, AnalysisResults Analysis)
         {
             public string? GeneratedName { get; set; }
             public bool IsDuplicate { get; set; }
index ada46b3..17a418b 100644 (file)
@@ -57,13 +57,43 @@ namespace System.Text.RegularExpressions.Generator
                 context.SyntaxProvider
 
                 // Find all MethodDeclarationSyntax nodes attributed with GeneratedRegex and gather the required information.
+                // The predicate will be run once for every attributed node in the same file that's being modified.
+                // The transform will be run once for every attributed node in the compilation.
+                // Thus, both should do the minimal amount of work required and get out.  This should also have extracted
+                // everything from the target necessary to do all subsequent analysis and should return an object that's
+                // meaningfully comparable and that doesn't reference anything from the compilation: we want to ensure
+                // that any successful cached results are idempotent for the input such that they don't trigger downstream work
+                // if there are no changes.
                 .ForAttributeWithMetadataName(
                     GeneratedRegexAttributeName,
                     (node, _) => node is MethodDeclarationSyntax,
-                    GetSemanticTargetForGeneration)
+                    GetRegexMethodDataOrFailureDiagnostic)
+
+                // Filter out any parsing errors that resulted in null objects being returned.
                 .Where(static m => m is not null)
 
-                // Incorporate the compilation data, as it impacts code generation.
+                // The input here will either be a Diagnostic (in the case of something erroneous detected in GetRegexMethodDataOrFailureDiagnostic)
+                // or it will be a RegexPatternAndSyntax containing all of the successfully parsed data from the attribute/method.
+                .Select((methodOrDiagnostic, _) =>
+                {
+                    if (methodOrDiagnostic is RegexPatternAndSyntax method)
+                    {
+                        try
+                        {
+                            RegexTree regexTree = RegexParser.Parse(method.Pattern, method.Options | RegexOptions.Compiled, method.Culture); // make sure Compiled is included to get all optimizations applied to it
+                            AnalysisResults analysis = RegexTreeAnalyzer.Analyze(regexTree);
+                            return new RegexMethod(method.DeclaringType, method.DiagnosticLocation, method.MethodName, method.Modifiers, method.Pattern, method.Options, method.MatchTimeout, regexTree, analysis);
+                        }
+                        catch (Exception e)
+                        {
+                            return Diagnostic.Create(DiagnosticDescriptors.InvalidRegexArguments, method.DiagnosticLocation, e.Message);
+                        }
+                    }
+
+                    return methodOrDiagnostic;
+                })
+
+                // Now incorporate the compilation data, as it impacts code generation.
                 .Combine(compilationDataProvider)
 
                 // Generate the RunnerFactory for each regex, if possible.  This is where the bulk of the implementation occurs.
@@ -80,7 +110,7 @@ namespace System.Text.RegularExpressions.Generator
                     // We'll still output a limited implementation that just caches a new Regex(...).
                     if (!SupportsCodeGeneration(regexMethod, methodOrDiagnosticAndCompilationData.Right.LanguageVersion, out string? reason))
                     {
-                        return (regexMethod, reason, Diagnostic.Create(DiagnosticDescriptors.LimitedSourceGeneration, regexMethod.MethodSyntax.GetLocation()), methodOrDiagnosticAndCompilationData.Right);
+                        return (regexMethod, reason, Diagnostic.Create(DiagnosticDescriptors.LimitedSourceGeneration, regexMethod.DiagnosticLocation), methodOrDiagnosticAndCompilationData.Right);
                     }
 
                     // Generate the core logic for the regex.
@@ -93,6 +123,8 @@ namespace System.Text.RegularExpressions.Generator
                     writer.Indent -= 2;
                     return (regexMethod, sw.ToString(), requiredHelpers, methodOrDiagnosticAndCompilationData.Right);
                 })
+
+                // Combine all of the generated text outputs into a single batch. We then generate a single source output from that batch.
                 .Collect();
 
             // When there something to output, take all the generated strings and concatenate them to output,