From f1ac00c9b0d17e48f464709fc554ebf73f165158 Mon Sep 17 00:00:00 2001
From: Haojian Wu <hokein.wu@gmail.com>
Date: Thu, 9 Jun 2022 11:42:29 +0200
Subject: [PATCH] [pseudo] Add grammar annotations support.

Add annotation handling ([key=value]) in the BNF grammar parser, which
will be used in the conditional reduction, and error recovery.

Reviewed By: sammccall

Differential Revision: https://reviews.llvm.org/D126536
---
 .../pseudo/include/clang-pseudo/Grammar.h          | 32 +++++++++++-
 clang-tools-extra/pseudo/lib/grammar/Grammar.cpp   |  2 +
 .../pseudo/lib/grammar/GrammarBNF.cpp              | 59 +++++++++++++++++++++-
 clang-tools-extra/pseudo/unittests/GrammarTest.cpp | 21 +++++++-
 4 files changed, 110 insertions(+), 4 deletions(-)
diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h b/clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h
index df757c6..fff96e9 100644
--- a/clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h
+++ b/clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h
@@ -19,6 +19,22 @@
 //  production rules. A rule is of BNF form (AAA := BBB CCC). A symbol is either
 //  nonterminal or terminal, identified by a SymbolID.
 //
+//  Annotations are supported in a syntax form of [key=value]. They specify
+//  attributes which are associated with either a grammar symbol (on the
+//  right-hand side of the symbol) or a grammar rule (at the end of the rule
+//  body).
+//  Attributes provide a way to inject custom code into the GLR parser. Each
+//  unique attribute value creates an extension point (identified by ExtensionID
+//  ), and an extension point corresponds to a piece of native code. For
+//  example, C++ grammar has a rule:
+//
+//    contextual-override := IDENTIFIER [guard=Override]
+//
+//  GLR parser only conducts the reduction of the rule if the IDENTIFIER
+//  content is `override`. This Override guard is implemented in CXX.cpp by
+//  binding the ExtensionID for the `Override` value to a specific C++ function
+//  that performs the check.
+//
 //  Notions about the BNF grammar:
 //  - "_" is the start symbol of the augmented grammar;
 //  - single-line comment is supported, starting with a #
@@ -69,6 +85,11 @@ inline SymbolID tokenSymbol(tok::TokenKind TK) {
   return TokenFlag | static_cast<SymbolID>(TK);
 }
 
+// An extension is a piece of native code specific to a grammar that modifies
+// the behavior of annotated rules. One ExtensionID is assigned for each unique
+// attribute value (all attributes share a namespace).
+using ExtensionID = uint16_t;
+
 // A RuleID uniquely identifies a production rule in a grammar.
 // It is an index into a table of rules.
 using RuleID = uint16_t;
@@ -96,11 +117,17 @@ struct Rule {
   uint8_t Size : SizeBits; // Size of the Sequence
   SymbolID Sequence[MaxElements];
 
+  // A guard extension controls whether a reduction of a rule will be conducted
+  // by the GLR parser.
+  // 0 is sentinel unset extension ID, indicating there is no guard extension
+  // being set for this rule.
+  ExtensionID Guard = 0;
+
   llvm::ArrayRef<SymbolID> seq() const {
     return llvm::ArrayRef<SymbolID>(Sequence, Size);
   }
   friend bool operator==(const Rule &L, const Rule &R) {
-    return L.Target == R.Target && L.seq() == R.seq();
+    return L.Target == R.Target && L.seq() == R.seq() && L.Guard == R.Guard;
   }
 };
 
@@ -186,6 +213,9 @@ struct GrammarTable {
   // A table of nonterminals, sorted by name.
   // SymbolID is the index of the table.
   std::vector<Nonterminal> Nonterminals;
+  // A table of attribute values, sorted by name.
+  // ExtensionID is the index of the table.
+  std::vector<std::string> AttributeValues;
 };
 
 } // namespace pseudo
diff --git a/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp b/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp
index 2be34f8..1763463 100644
--- a/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp
+++ b/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp
@@ -61,6 +61,8 @@ std::string Grammar::dumpRule(RuleID RID) const {
   OS << symbolName(R.Target) << " :=";
   for (SymbolID SID : R.seq())
     OS << " " << symbolName(SID);
+  if (R.Guard)
+    OS << " [guard=" << T->AttributeValues[R.Guard] << "]";
   return Result;
 }
 
diff --git a/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp b/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp
index f581adb..82daf47 100644
--- a/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp
+++ b/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp
@@ -47,6 +47,9 @@ public:
     // Assemble the name->ID and ID->nonterminal name maps.
     llvm::DenseSet<llvm::StringRef> UniqueNonterminals;
     llvm::DenseMap<llvm::StringRef, SymbolID> SymbolIds;
+
+    llvm::DenseSet<llvm::StringRef> UniqueAttributeValues;
+
     for (uint16_t I = 0; I < NumTerminals; ++I)
       SymbolIds.try_emplace(T->Terminals[I], tokenSymbol(tok::TokenKind(I)));
     auto Consider = [&](llvm::StringRef Name) {
@@ -55,8 +58,11 @@ public:
     };
     for (const auto &Spec : Specs) {
       Consider(Spec.Target);
-      for (const RuleSpec::Element &Elt : Spec.Sequence)
+      for (const RuleSpec::Element &Elt : Spec.Sequence) {
         Consider(Elt.Symbol);
+        for (const auto& KV : Elt.Attributes)
+           UniqueAttributeValues.insert(KV.second);
+      }
     }
     llvm::for_each(UniqueNonterminals, [&T](llvm::StringRef Name) {
       T->Nonterminals.emplace_back();
@@ -68,6 +74,15 @@ public:
                                    const GrammarTable::Nonterminal &R) {
       return L.Name < R.Name;
     });
+    // Add an empty string for the corresponding sentinel unset attribute.
+    T->AttributeValues.push_back("");
+    llvm::for_each(UniqueAttributeValues, [&T](llvm::StringRef Name) {
+      T->AttributeValues.emplace_back();
+      T->AttributeValues.back() = Name.str();
+    });
+    llvm::sort(T->AttributeValues);
+    assert(T->AttributeValues.front() == "");
+
     // Build name -> ID maps for nonterminals.
     for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID)
       SymbolIds.try_emplace(T->Nonterminals[SID].Name, SID);
@@ -86,7 +101,9 @@ public:
       for (const RuleSpec::Element &Elt : Spec.Sequence)
         Symbols.push_back(Lookup(Elt.Symbol));
       T->Rules.push_back(Rule(Lookup(Spec.Target), Symbols));
+      applyAttributes(Spec, *T, T->Rules.back());
     }
+
     assert(T->Rules.size() < (1 << RuleBits) &&
            "Too many rules to fit in RuleID bits!");
     const auto &SymbolOrder = getTopologicalOrder(T.get());
@@ -164,6 +181,9 @@ private:
     llvm::StringRef Target;
     struct Element {
       llvm::StringRef Symbol; // Name of the symbol
+      // Attributes that are associated to the sequence symbol or rule.
+      std::vector<std::pair<llvm::StringRef/*Key*/, llvm::StringRef/*Value*/>>
+          Attributes;
     };
     std::vector<Element> Sequence;
 
@@ -204,11 +224,46 @@ private:
       Chunk = Chunk.trim();
       if (Chunk.empty())
         continue; // skip empty
+      if (Chunk.startswith("[") && Chunk.endswith("]")) {
+        if (Out.Sequence.empty())
+          continue;
+
+        parseAttributes(Chunk, Out.Sequence.back().Attributes);
+        continue;
+      }
 
       Out.Sequence.push_back({Chunk});
     }
     return true;
-  };
+  }
+
+  bool parseAttributes(
+      llvm::StringRef Content,
+      std::vector<std::pair<llvm::StringRef, llvm::StringRef>> &Out) {
+    assert(Content.startswith("[") && Content.endswith("]"));
+    auto KV = Content.drop_front().drop_back().split('=');
+    Out.push_back({KV.first, KV.second.trim()});
+
+    return true;
+  }
+  // Apply the parsed extensions (stored in RuleSpec) to the grammar Rule.
+  void applyAttributes(const RuleSpec& Spec, const GrammarTable& T, Rule& R) {
+    auto LookupExtensionID = [&T](llvm::StringRef Name) {
+      const auto It = llvm::partition_point(
+          T.AttributeValues, [&](llvm::StringRef X) { return X < Name; });
+      assert(It != T.AttributeValues.end() && *It == Name &&
+             "Didn't find the attribute in AttrValues!");
+      return It - T.AttributeValues.begin();
+    };
+    for (const auto &KV : Spec.Sequence.back().Attributes) {
+      if (KV.first == "guard") {
+        R.Guard = LookupExtensionID(KV.second);
+        continue;
+      }
+      Diagnostics.push_back(
+          llvm::formatv("Unknown attribute '{0}'", KV.first).str());
+    }
+  }
 
   // Inlines all _opt symbols.
   // For example, a rule E := id +_opt id, after elimination, we have two
diff --git a/clang-tools-extra/pseudo/unittests/GrammarTest.cpp b/clang-tools-extra/pseudo/unittests/GrammarTest.cpp
index 0f71c47..cc72ca2 100644
--- a/clang-tools-extra/pseudo/unittests/GrammarTest.cpp
+++ b/clang-tools-extra/pseudo/unittests/GrammarTest.cpp
@@ -99,6 +99,22 @@ TEST_F(GrammarTest, RuleIDSorted) {
   EXPECT_LT(ruleFor("x"), ruleFor("_"));
 }
 
+TEST_F(GrammarTest, Annotation) {
+  build(R"bnf(
+    _ := x
+
+    x := y [guard=value]
+    y := IDENTIFIER [guard=final]
+
+  )bnf");
+  ASSERT_TRUE(Diags.empty());
+  EXPECT_EQ(G->lookupRule(ruleFor("_")).Guard, 0);
+  EXPECT_GT(G->lookupRule(ruleFor("x")).Guard, 0);
+  EXPECT_GT(G->lookupRule(ruleFor("y")).Guard, 0);
+  EXPECT_NE(G->lookupRule(ruleFor("x")).Guard,
+            G->lookupRule(ruleFor("y")).Guard);
+}
+
 TEST_F(GrammarTest, Diagnostics) {
   build(R"cpp(
     _ := ,_opt
@@ -110,6 +126,8 @@ TEST_F(GrammarTest, Diagnostics) {
     # cycle
     a := b
     b := a
+
+    _ := IDENTIFIER [unknown=value]
   )cpp");
 
   EXPECT_EQ(G->underscore(), id("_"));
@@ -120,7 +138,8 @@ TEST_F(GrammarTest, Diagnostics) {
                          "Failed to parse 'invalid': no separator :=",
                          "Token-like name IDENFIFIE is used as a nonterminal",
                          "No rules for nonterminal: IDENFIFIE",
-                         "The grammar contains a cycle involving symbol a"));
+                         "The grammar contains a cycle involving symbol a",
+                         "Unknown attribute 'unknown'"));
 }
 
 TEST_F(GrammarTest, FirstAndFollowSets) {
-- 
2.7.4