From f1ac00c9b0d17e48f464709fc554ebf73f165158 Mon Sep 17 00:00:00 2001 From: Haojian Wu Date: Thu, 9 Jun 2022 11:42:29 +0200 Subject: [PATCH] [pseudo] Add grammar annotations support. Add annotation handling ([key=value]) in the BNF grammar parser, which will be used in the conditional reduction, and error recovery. Reviewed By: sammccall Differential Revision: https://reviews.llvm.org/D126536 --- .../pseudo/include/clang-pseudo/Grammar.h | 32 +++++++++++- clang-tools-extra/pseudo/lib/grammar/Grammar.cpp | 2 + .../pseudo/lib/grammar/GrammarBNF.cpp | 59 +++++++++++++++++++++- clang-tools-extra/pseudo/unittests/GrammarTest.cpp | 21 +++++++- 4 files changed, 110 insertions(+), 4 deletions(-) diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h b/clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h index df757c6..fff96e9 100644 --- a/clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h +++ b/clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h @@ -19,6 +19,22 @@ // production rules. A rule is of BNF form (AAA := BBB CCC). A symbol is either // nonterminal or terminal, identified by a SymbolID. // +// Annotations are supported in a syntax form of [key=value]. They specify +// attributes which are associated with either a grammar symbol (on the +// right-hand side of the symbol) or a grammar rule (at the end of the rule +// body). +// Attributes provide a way to inject custom code into the GLR parser. Each +// unique attribute value creates an extension point (identified by ExtensionID +// ), and an extension point corresponds to a piece of native code. For +// example, C++ grammar has a rule: +// +// contextual-override := IDENTIFIER [guard=Override] +// +// GLR parser only conducts the reduction of the rule if the IDENTIFIER +// content is `override`. This Override guard is implemented in CXX.cpp by +// binding the ExtensionID for the `Override` value to a specific C++ function +// that performs the check. +// // Notions about the BNF grammar: // - "_" is the start symbol of the augmented grammar; // - single-line comment is supported, starting with a # @@ -69,6 +85,11 @@ inline SymbolID tokenSymbol(tok::TokenKind TK) { return TokenFlag | static_cast(TK); } +// An extension is a piece of native code specific to a grammar that modifies +// the behavior of annotated rules. One ExtensionID is assigned for each unique +// attribute value (all attributes share a namespace). +using ExtensionID = uint16_t; + // A RuleID uniquely identifies a production rule in a grammar. // It is an index into a table of rules. using RuleID = uint16_t; @@ -96,11 +117,17 @@ struct Rule { uint8_t Size : SizeBits; // Size of the Sequence SymbolID Sequence[MaxElements]; + // A guard extension controls whether a reduction of a rule will be conducted + // by the GLR parser. + // 0 is sentinel unset extension ID, indicating there is no guard extension + // being set for this rule. + ExtensionID Guard = 0; + llvm::ArrayRef seq() const { return llvm::ArrayRef(Sequence, Size); } friend bool operator==(const Rule &L, const Rule &R) { - return L.Target == R.Target && L.seq() == R.seq(); + return L.Target == R.Target && L.seq() == R.seq() && L.Guard == R.Guard; } }; @@ -186,6 +213,9 @@ struct GrammarTable { // A table of nonterminals, sorted by name. // SymbolID is the index of the table. std::vector Nonterminals; + // A table of attribute values, sorted by name. + // ExtensionID is the index of the table. + std::vector AttributeValues; }; } // namespace pseudo diff --git a/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp b/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp index 2be34f8..1763463 100644 --- a/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp +++ b/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp @@ -61,6 +61,8 @@ std::string Grammar::dumpRule(RuleID RID) const { OS << symbolName(R.Target) << " :="; for (SymbolID SID : R.seq()) OS << " " << symbolName(SID); + if (R.Guard) + OS << " [guard=" << T->AttributeValues[R.Guard] << "]"; return Result; } diff --git a/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp b/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp index f581adb..82daf47 100644 --- a/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp +++ b/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp @@ -47,6 +47,9 @@ public: // Assemble the name->ID and ID->nonterminal name maps. llvm::DenseSet UniqueNonterminals; llvm::DenseMap SymbolIds; + + llvm::DenseSet UniqueAttributeValues; + for (uint16_t I = 0; I < NumTerminals; ++I) SymbolIds.try_emplace(T->Terminals[I], tokenSymbol(tok::TokenKind(I))); auto Consider = [&](llvm::StringRef Name) { @@ -55,8 +58,11 @@ public: }; for (const auto &Spec : Specs) { Consider(Spec.Target); - for (const RuleSpec::Element &Elt : Spec.Sequence) + for (const RuleSpec::Element &Elt : Spec.Sequence) { Consider(Elt.Symbol); + for (const auto& KV : Elt.Attributes) + UniqueAttributeValues.insert(KV.second); + } } llvm::for_each(UniqueNonterminals, [&T](llvm::StringRef Name) { T->Nonterminals.emplace_back(); @@ -68,6 +74,15 @@ public: const GrammarTable::Nonterminal &R) { return L.Name < R.Name; }); + // Add an empty string for the corresponding sentinel unset attribute. + T->AttributeValues.push_back(""); + llvm::for_each(UniqueAttributeValues, [&T](llvm::StringRef Name) { + T->AttributeValues.emplace_back(); + T->AttributeValues.back() = Name.str(); + }); + llvm::sort(T->AttributeValues); + assert(T->AttributeValues.front() == ""); + // Build name -> ID maps for nonterminals. for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID) SymbolIds.try_emplace(T->Nonterminals[SID].Name, SID); @@ -86,7 +101,9 @@ public: for (const RuleSpec::Element &Elt : Spec.Sequence) Symbols.push_back(Lookup(Elt.Symbol)); T->Rules.push_back(Rule(Lookup(Spec.Target), Symbols)); + applyAttributes(Spec, *T, T->Rules.back()); } + assert(T->Rules.size() < (1 << RuleBits) && "Too many rules to fit in RuleID bits!"); const auto &SymbolOrder = getTopologicalOrder(T.get()); @@ -164,6 +181,9 @@ private: llvm::StringRef Target; struct Element { llvm::StringRef Symbol; // Name of the symbol + // Attributes that are associated to the sequence symbol or rule. + std::vector> + Attributes; }; std::vector Sequence; @@ -204,11 +224,46 @@ private: Chunk = Chunk.trim(); if (Chunk.empty()) continue; // skip empty + if (Chunk.startswith("[") && Chunk.endswith("]")) { + if (Out.Sequence.empty()) + continue; + + parseAttributes(Chunk, Out.Sequence.back().Attributes); + continue; + } Out.Sequence.push_back({Chunk}); } return true; - }; + } + + bool parseAttributes( + llvm::StringRef Content, + std::vector> &Out) { + assert(Content.startswith("[") && Content.endswith("]")); + auto KV = Content.drop_front().drop_back().split('='); + Out.push_back({KV.first, KV.second.trim()}); + + return true; + } + // Apply the parsed extensions (stored in RuleSpec) to the grammar Rule. + void applyAttributes(const RuleSpec& Spec, const GrammarTable& T, Rule& R) { + auto LookupExtensionID = [&T](llvm::StringRef Name) { + const auto It = llvm::partition_point( + T.AttributeValues, [&](llvm::StringRef X) { return X < Name; }); + assert(It != T.AttributeValues.end() && *It == Name && + "Didn't find the attribute in AttrValues!"); + return It - T.AttributeValues.begin(); + }; + for (const auto &KV : Spec.Sequence.back().Attributes) { + if (KV.first == "guard") { + R.Guard = LookupExtensionID(KV.second); + continue; + } + Diagnostics.push_back( + llvm::formatv("Unknown attribute '{0}'", KV.first).str()); + } + } // Inlines all _opt symbols. // For example, a rule E := id +_opt id, after elimination, we have two diff --git a/clang-tools-extra/pseudo/unittests/GrammarTest.cpp b/clang-tools-extra/pseudo/unittests/GrammarTest.cpp index 0f71c47..cc72ca2 100644 --- a/clang-tools-extra/pseudo/unittests/GrammarTest.cpp +++ b/clang-tools-extra/pseudo/unittests/GrammarTest.cpp @@ -99,6 +99,22 @@ TEST_F(GrammarTest, RuleIDSorted) { EXPECT_LT(ruleFor("x"), ruleFor("_")); } +TEST_F(GrammarTest, Annotation) { + build(R"bnf( + _ := x + + x := y [guard=value] + y := IDENTIFIER [guard=final] + + )bnf"); + ASSERT_TRUE(Diags.empty()); + EXPECT_EQ(G->lookupRule(ruleFor("_")).Guard, 0); + EXPECT_GT(G->lookupRule(ruleFor("x")).Guard, 0); + EXPECT_GT(G->lookupRule(ruleFor("y")).Guard, 0); + EXPECT_NE(G->lookupRule(ruleFor("x")).Guard, + G->lookupRule(ruleFor("y")).Guard); +} + TEST_F(GrammarTest, Diagnostics) { build(R"cpp( _ := ,_opt @@ -110,6 +126,8 @@ TEST_F(GrammarTest, Diagnostics) { # cycle a := b b := a + + _ := IDENTIFIER [unknown=value] )cpp"); EXPECT_EQ(G->underscore(), id("_")); @@ -120,7 +138,8 @@ TEST_F(GrammarTest, Diagnostics) { "Failed to parse 'invalid': no separator :=", "Token-like name IDENFIFIE is used as a nonterminal", "No rules for nonterminal: IDENFIFIE", - "The grammar contains a cycle involving symbol a")); + "The grammar contains a cycle involving symbol a", + "Unknown attribute 'unknown'")); } TEST_F(GrammarTest, FirstAndFollowSets) { -- 2.7.4