From: Haojian Wu Date: Wed, 19 Jan 2022 12:14:57 +0000 (+0100) Subject: [syntax][pseudo] Add Grammar for the clang pseudo-parser X-Git-Tag: upstream/15.0.7~17962 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=20e05b9f0ebea35076b96c89257becd35d6de859;p=platform%2Fupstream%2Fllvm.git [syntax][pseudo] Add Grammar for the clang pseudo-parser This patch introduces the Grammar class, which is a critical piece for constructing a table-based parser. As the first patch, the scope is limited to: - define base types (symbol, rules) for modeling the grammar - construct Grammar by parsing the BNF file (annotations are excluded for now) Differential Revision: https://reviews.llvm.org/D114790 --- diff --git a/clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h b/clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h new file mode 100644 index 0000000..80db9f2 --- /dev/null +++ b/clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h @@ -0,0 +1,170 @@ +//===--- Grammar.h - grammar used by clang pseudo parser --------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines base structures for parsing & modeling a grammar for a +// programming language: +// +// # This is a fake C++ BNF grammar +// _ := translation-unit +// translation-unit := declaration-seq_opt +// declaration-seq := declaration +// declaration-seq := declaration-seq declaration +// +// A grammar formally describes a language, and it is constructed by a set of +// production rules. A rule is of BNF form (AAA := BBB CCC). A symbol is either +// non-terminal or terminal, identified by a SymbolID. +// +// Notions about the BNF grammar: +// - "_" is the augmented symbol, formed by start symbols. 
+// - single-line comments are supported, starting with a # +// - A rule describes how a nonterminal (left side of :=) is constructed, and +// it is *per line* in the grammar file +// - Terminals (also called tokens) correspond to the clang::TokenKind; they +// are written in the grammar like "IDENTIFIER", "USING", "+" +// - Nonterminals are specified with "lower-case" names in the grammar; they +// shouldn't be nullable (i.e. have a rule with an empty sequence) +// - optional symbols are supported (specified with a _opt suffix), and they +// will be eliminated during the grammar parsing stage +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLING_SYNTAX_GRAMMAR_H +#define LLVM_CLANG_TOOLING_SYNTAX_GRAMMAR_H + +#include "clang/Basic/TokenKinds.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" +#include +#include + +namespace clang { +namespace syntax { +namespace pseudo { +// A SymbolID uniquely identifies a terminal/non-terminal symbol in a grammar. +// Non-terminal IDs are indexes into a table of non-terminal symbols. +// Terminal IDs correspond to the clang TokenKind enum. +using SymbolID = uint16_t; +// SymbolID is only 12 bits wide. +// There are at most 2^11 terminals (aka tokens) and 2^11 nonterminals. +static constexpr uint16_t SymbolBits = 12; +static constexpr uint16_t NumTerminals = tok::NUM_TOKENS; +// SymbolIDs with the top bit (bit SymbolBits - 1) set are tokens/terminals. +static constexpr SymbolID TokenFlag = 1 << (SymbolBits - 1); +inline bool isToken(SymbolID ID) { return ID & TokenFlag; } +inline bool isNonterminal(SymbolID ID) { return !isToken(ID); } +// The terminals are always the clang tok::TokenKind (not all are used). 
+inline tok::TokenKind symbolToToken(SymbolID SID) { + assert(isToken(SID)); + SID &= ~TokenFlag; + assert(SID < NumTerminals); + return static_cast(SID); +} +inline SymbolID tokenSymbol(tok::TokenKind TK) { + return TokenFlag | static_cast(TK); +} + +// A RuleID uniquely identifies a production rule in a grammar. +// It is an index into a table of rules. +using RuleID = uint16_t; +// There are at most 2^12 rules. +static constexpr unsigned RuleBits = 12; + +// Represent a production rule in the grammar, e.g. +// expression := a b c +// ^Target ^Sequence +struct Rule { + Rule(SymbolID Target, llvm::ArrayRef Seq); + + // We occupy 4 bits for the size of the sequence, so in theory it can be at + // most 2^4 - 1 tokens long; however, we're stricter in order to reduce the + // size: we limit the max length to 9 (the longest sequence in cxx grammar). + static constexpr unsigned SizeBits = 4; + static constexpr unsigned MaxElements = 9; + // Strictly less: a SizeBits-wide field cannot represent the value 2^SizeBits. + static_assert(MaxElements < (1 << SizeBits), "Exceeds the maximum limit"); + static_assert(SizeBits + SymbolBits <= 16, + "Must be able to store symbol ID + size efficiently"); + + // 16 bits for target symbol and size of sequence: + // SymbolID : 12 | Size : 4 + SymbolID Target : SymbolBits; + uint8_t Size : SizeBits; // Size of the Sequence + SymbolID Sequence[MaxElements]; + + // Returns the first Size elements of Sequence (the rule's right-hand side). + llvm::ArrayRef seq() const { + return llvm::ArrayRef(Sequence, Size); + } + // Two rules are equal when both the target and the used part of the + // sequence match; unused slots of Sequence are not compared. + friend bool operator==(const Rule &L, const Rule &R) { + return L.Target == R.Target && L.seq() == R.seq(); + } +}; + +struct GrammarTable; + +// Grammar that describes a programming language, e.g. C++. It represents the +// contents of the specified grammar. +// It is a building block for constructing a table-based parser. +class Grammar { +public: + explicit Grammar(std::unique_ptr T) : T(std::move(T)) {} + + // Parses grammar from a BNF file. + // Diagnostics emitted during parsing are stored in Diags. 
+ static std::unique_ptr parseBNF(llvm::StringRef BNF, + std::vector &Diags); + + // Returns all rules of the given non-terminal symbol. + llvm::ArrayRef rulesFor(SymbolID SID) const; + const Rule &lookupRule(RuleID RID) const; + + // Gets symbol (terminal or non-terminal) name. + // Terminals have names like "," (kw_comma) or "OPERATOR" (kw_operator). + llvm::StringRef symbolName(SymbolID) const; + + // Dumps the whole grammar. + std::string dump() const; + // Dumps a particular rule. + std::string dumpRule(RuleID) const; + // Dumps all rules of the given nonterminal symbol. + std::string dumpRules(SymbolID) const; + + const GrammarTable &table() const { return *T; } + +private: + std::unique_ptr T; +}; + +// Storage for the underlying data of the Grammar. +// It can be constructed dynamically (from compiling BNF file) or statically +// (a compiled data-source). +struct GrammarTable { + struct Nonterminal { + std::string Name; + // Corresponding rules that construct the non-terminal, it is a [start, end) + // index range of the Rules table. + struct { + RuleID start; + RuleID end; + } RuleRange; + }; + + // The rules are sorted (and thus grouped) by target symbol. + // RuleID is the index of the vector. + std::vector Rules; + // A table of terminals (aka tokens). It corresponds to the clang::Token. + // clang::tok::TokenKind is the index of the table. + std::vector Terminals; + // A table of nonterminals, sorted by name. + // SymbolID is the index of the table. 
+ std::vector Nonterminals; +}; + +} // namespace pseudo +} // namespace syntax +} // namespace clang + +#endif // LLVM_CLANG_TOOLING_SYNTAX_GRAMMAR_H diff --git a/clang/lib/Tooling/Syntax/CMakeLists.txt b/clang/lib/Tooling/Syntax/CMakeLists.txt index e933fae..f8d9184 100644 --- a/clang/lib/Tooling/Syntax/CMakeLists.txt +++ b/clang/lib/Tooling/Syntax/CMakeLists.txt @@ -19,3 +19,5 @@ add_clang_library(clangToolingSyntax DEPENDS omp_gen ) + +add_subdirectory(Pseudo) diff --git a/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt b/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt new file mode 100644 index 0000000..77dce4b --- /dev/null +++ b/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt @@ -0,0 +1,10 @@ +set(LLVM_LINK_COMPONENTS Support) + +add_clang_library(clangSyntaxPseudo + Grammar.cpp + GrammarBNF.cpp + + LINK_LIBS + clangBasic + clangLex + ) diff --git a/clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp b/clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp new file mode 100644 index 0000000..014e6b4 --- /dev/null +++ b/clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp @@ -0,0 +1,77 @@ +//===--- Grammar.cpp - Grammar for clang pseudo parser ----------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Tooling/Syntax/Pseudo/Grammar.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/raw_ostream.h" + +namespace clang { +namespace syntax { +namespace pseudo { + +Rule::Rule(SymbolID Target, llvm::ArrayRef Sequence) + : Target(Target), Size(static_cast(Sequence.size())) { + assert(Sequence.size() <= Rule::MaxElements); + llvm::copy(Sequence, this->Sequence); +} + +llvm::ArrayRef Grammar::rulesFor(SymbolID SID) const { + assert(isNonterminal(SID)); + const auto &R = T->Nonterminals[SID].RuleRange; + assert(R.end <= T->Rules.size()); + return llvm::makeArrayRef(&T->Rules[R.start], R.end - R.start); +} + +const Rule &Grammar::lookupRule(RuleID RID) const { + assert(RID < T->Rules.size()); + return T->Rules[RID]; +} + +llvm::StringRef Grammar::symbolName(SymbolID SID) const { + if (isToken(SID)) + return T->Terminals[symbolToToken(SID)]; + return T->Nonterminals[SID].Name; +} + +std::string Grammar::dumpRule(RuleID RID) const { + std::string Result; + llvm::raw_string_ostream OS(Result); + const Rule &R = T->Rules[RID]; + OS << symbolName(R.Target) << " :="; + for (SymbolID SID : R.seq()) + OS << " " << symbolName(SID); + return Result; +} + +std::string Grammar::dumpRules(SymbolID SID) const { + assert(isNonterminal(SID)); + std::string Result; + const auto &Range = T->Nonterminals[SID].RuleRange; + for (RuleID RID = Range.start; RID < Range.end; ++RID) + Result.append(dumpRule(RID)).push_back('\n'); + return Result; +} + +std::string Grammar::dump() const { + std::string Result; + llvm::raw_string_ostream OS(Result); + OS << "Nonterminals:\n"; + for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID) + OS << llvm::formatv(" {0} {1}\n", SID, symbolName(SID)); + OS << "Rules:\n"; + for 
(RuleID RID = 0; RID < T->Rules.size(); ++RID) + OS << llvm::formatv(" {0} {1}\n", RID, dumpRule(RID)); + return OS.str(); +} + +} // namespace pseudo +} // namespace syntax +} // namespace clang diff --git a/clang/lib/Tooling/Syntax/Pseudo/GrammarBNF.cpp b/clang/lib/Tooling/Syntax/Pseudo/GrammarBNF.cpp new file mode 100644 index 0000000..40181e0 --- /dev/null +++ b/clang/lib/Tooling/Syntax/Pseudo/GrammarBNF.cpp @@ -0,0 +1,260 @@ +//===--- GrammarBNF.cpp - build grammar from BNF files ----------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Basic/TokenKinds.h" +#include "clang/Tooling/Syntax/Pseudo/Grammar.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Support/FormatVariadic.h" +#include + +namespace clang { +namespace syntax { +namespace pseudo { + +namespace { +static const llvm::StringRef OptSuffix = "_opt"; +static const llvm::StringRef StartSymbol = "_"; + +void initTerminals(std::vector &Out) { + Out.clear(); + Out.reserve(NumTerminals); + for (unsigned I = 0; I < NumTerminals; ++I) { + tok::TokenKind K = static_cast(I); + if (const auto *Punc = tok::getPunctuatorSpelling(K)) + Out.push_back(Punc); + else + Out.push_back(llvm::StringRef(tok::getTokenName(K)).upper()); + } +} +// Builds grammar from BNF files. 
+class GrammarBuilder { +public: + GrammarBuilder(std::vector &Diagnostics) + : Diagnostics(Diagnostics) {} + + std::unique_ptr build(llvm::StringRef BNF) { + auto Specs = eliminateOptional(parse(BNF)); + + assert(llvm::all_of(Specs, + [](const RuleSpec &R) { + if (R.Target.endswith(OptSuffix)) + return false; + return llvm::all_of( + R.Sequence, [](const RuleSpec::Element &E) { + return !E.Symbol.endswith(OptSuffix); + }); + }) && + "Optional symbols should be eliminated!"); + + auto T = std::make_unique(); + initTerminals(T->Terminals); + + // Assemble the name->ID and ID->nonterminal name maps. + llvm::DenseSet UniqueNonterminals; + llvm::DenseMap SymbolIds; + for (uint16_t I = 0; I < NumTerminals; ++I) + SymbolIds.try_emplace(T->Terminals[I], tokenSymbol(tok::TokenKind(I))); + auto Consider = [&](llvm::StringRef Name) { + if (!SymbolIds.count(Name)) + UniqueNonterminals.insert(Name); + }; + for (const auto &Spec : Specs) { + Consider(Spec.Target); + for (const RuleSpec::Element &Elt : Spec.Sequence) + Consider(Elt.Symbol); + } + llvm::for_each(UniqueNonterminals, [&T](llvm::StringRef Name) { + T->Nonterminals.emplace_back(); + T->Nonterminals.back().Name = Name.str(); + }); + assert(T->Nonterminals.size() < (1 << (SymbolBits - 1)) && + "Too many nonterminals to fit in SymbolID bits!"); + llvm::sort(T->Nonterminals, [](const GrammarTable::Nonterminal &L, + const GrammarTable::Nonterminal &R) { + return L.Name < R.Name; + }); + // Build name -> ID maps for nonterminals. + for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID) + SymbolIds.try_emplace(T->Nonterminals[SID].Name, SID); + + // Convert the rules. 
+ T->Rules.reserve(Specs.size()); + std::vector Symbols; + auto Lookup = [SymbolIds](llvm::StringRef Name) { + auto It = SymbolIds.find(Name); + assert(It != SymbolIds.end() && "Didn't find the symbol in SymbolIds!"); + return It->second; + }; + for (const auto &Spec : Specs) { + assert(Spec.Sequence.size() < Rule::MaxElements); + Symbols.clear(); + for (const RuleSpec::Element &Elt : Spec.Sequence) + Symbols.push_back(Lookup(Elt.Symbol)); + T->Rules.push_back(Rule(Lookup(Spec.Target), Symbols)); + } + assert(T->Rules.size() < (1 << RuleBits) && + "Too many rules to fit in RuleID bits!"); + llvm::sort(T->Rules, [](const Rule &Left, const Rule &Right) { + // Sorted by the Target. + return std::tie(Left.Target, Left.Size) < + std::tie(Right.Target, Right.Size); + }); + RuleID RulePos = 0; + for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID) { + RuleID Start = RulePos; + while (RulePos < T->Rules.size() && T->Rules[RulePos].Target == SID) + ++RulePos; + T->Nonterminals[SID].RuleRange = {Start, RulePos}; + } + auto G = std::make_unique(std::move(T)); + diagnoseGrammar(*G); + return G; + } + +private: + // Text representation of a BNF grammar rule. + struct RuleSpec { + llvm::StringRef Target; + struct Element { + llvm::StringRef Symbol; // Name of the symbol + }; + std::vector Sequence; + + std::string toString() const { + std::vector Body; + for (const auto &E : Sequence) + Body.push_back(E.Symbol); + return llvm::formatv("{0} := {1}", Target, llvm::join(Body, " ")); + } + }; + + std::vector parse(llvm::StringRef Lines) { + std::vector Specs; + for (llvm::StringRef Line : llvm::split(Lines, '\n')) { + Line = Line.trim(); + // Strip anything coming after the '#' (comment). 
+ Line = Line.take_while([](char C) { return C != '#'; }); + if (Line.empty()) + continue; + RuleSpec Rule; + if (parseLine(Line, Rule)) + Specs.push_back(std::move(Rule)); + } + return Specs; + } + + bool parseLine(llvm::StringRef Line, RuleSpec &Out) { + auto Parts = Line.split(":="); + if (Parts.first == Line) { // no separator in Line + Diagnostics.push_back( + llvm::formatv("Failed to parse '{0}': no separator :=", Line).str()); + return false; + } + + Out.Target = Parts.first.trim(); + Out.Sequence.clear(); + for (llvm::StringRef Chunk : llvm::split(Parts.second, ' ')) { + Chunk = Chunk.trim(); + if (Chunk.empty()) + continue; // skip empty + + Out.Sequence.push_back({Chunk}); + } + return true; + }; + + // Inlines all _opt symbols. + // For example, a rule E := id +_opt id, after elimination, we have two + // equivalent rules: + // 1) E := id + id + // 2) E := id id + std::vector eliminateOptional(llvm::ArrayRef Input) { + std::vector Results; + std::vector Storage; + for (const auto &R : Input) { + eliminateOptionalTail( + R.Sequence, Storage, [&Results, &Storage, &R, this]() { + if (Storage.empty()) { + Diagnostics.push_back( + llvm::formatv("Rule '{0}' has a nullable RHS", R.toString())); + return; + } + Results.push_back({R.Target, Storage}); + }); + assert(Storage.empty()); + } + return Results; + } + void eliminateOptionalTail(llvm::ArrayRef Elements, + std::vector &Result, + llvm::function_ref CB) { + if (Elements.empty()) + return CB(); + auto Front = Elements.front(); + if (!Front.Symbol.endswith(OptSuffix)) { + Result.push_back(std::move(Front)); + eliminateOptionalTail(Elements.drop_front(1), Result, CB); + Result.pop_back(); + return; + } + // Enumerate two options: skip the opt symbol, or inline the symbol. 
+ eliminateOptionalTail(Elements.drop_front(1), Result, CB); // skip + Front.Symbol = Front.Symbol.drop_back(OptSuffix.size()); // drop "_opt" + Result.push_back(std::move(Front)); + eliminateOptionalTail(Elements.drop_front(1), Result, CB); + Result.pop_back(); + } + + // Diagnoses the grammar and emit warnings if any. + void diagnoseGrammar(const Grammar &G) { + const auto &T = G.table(); + for (SymbolID SID = 0; SID < T.Nonterminals.size(); ++SID) { + auto Range = T.Nonterminals[SID].RuleRange; + if (Range.start == Range.end) + Diagnostics.push_back( + llvm::formatv("No rules for nonterminal: {0}", G.symbolName(SID))); + llvm::StringRef NameRef = T.Nonterminals[SID].Name; + if (llvm::all_of(NameRef, llvm::isAlpha) && NameRef.upper() == NameRef) { + Diagnostics.push_back(llvm::formatv( + "Token-like name {0} is used as a nonterminal", G.symbolName(SID))); + } + } + for (RuleID RID = 0; RID + 1 < T.Rules.size(); ++RID) { + if (T.Rules[RID] == T.Rules[RID + 1]) + Diagnostics.push_back( + llvm::formatv("Duplicate rule: `{0}`", G.dumpRule(RID))); + // Warning for nullable non-terminals + if (T.Rules[RID].Size == 0) + Diagnostics.push_back( + llvm::formatv("Rule `{0}` has a nullable RHS", G.dumpRule(RID))); + } + // symbol-id -> used counts + std::vector UseCounts(T.Nonterminals.size(), 0); + for (const Rule &R : T.Rules) + for (SymbolID SID : R.seq()) + if (isNonterminal(SID)) + ++UseCounts[SID]; + for (SymbolID SID = 0; SID < UseCounts.size(); ++SID) + if (UseCounts[SID] == 0 && T.Nonterminals[SID].Name != StartSymbol) + Diagnostics.push_back( + llvm::formatv("Nonterminal never used: {0}", G.symbolName(SID))); + } + std::vector &Diagnostics; +}; +} // namespace + +std::unique_ptr +Grammar::parseBNF(llvm::StringRef BNF, std::vector &Diagnostics) { + Diagnostics.clear(); + return GrammarBuilder(Diagnostics).build(BNF); +} + +} // namespace pseudo +} // namespace syntax +} // namespace clang diff --git a/clang/unittests/Tooling/Syntax/CMakeLists.txt 
b/clang/unittests/Tooling/Syntax/CMakeLists.txt index 174f3e7..a21d558 100644 --- a/clang/unittests/Tooling/Syntax/CMakeLists.txt +++ b/clang/unittests/Tooling/Syntax/CMakeLists.txt @@ -28,3 +28,5 @@ target_link_libraries(SyntaxTests PRIVATE LLVMTestingSupport ) + +add_subdirectory(Pseudo) diff --git a/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt b/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt new file mode 100644 index 0000000..77c6cba --- /dev/null +++ b/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt @@ -0,0 +1,20 @@ +set(LLVM_LINK_COMPONENTS + Support + ) + +add_clang_unittest(ClangPseudoTests + GrammarTests.cpp +) + +clang_target_link_libraries(ClangPseudoTests + PRIVATE + clangBasic + clangLex + clangSyntaxPseudo + clangTesting + ) + +target_link_libraries(ClangPseudoTests + PRIVATE + LLVMTestingSupport + ) diff --git a/clang/unittests/Tooling/Syntax/Pseudo/GrammarTests.cpp b/clang/unittests/Tooling/Syntax/Pseudo/GrammarTests.cpp new file mode 100644 index 0000000..7954e53 --- /dev/null +++ b/clang/unittests/Tooling/Syntax/Pseudo/GrammarTests.cpp @@ -0,0 +1,102 @@ +//===--- GrammarTests.cpp - grammar tests ----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Tooling/Syntax/Pseudo/Grammar.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include + +namespace clang { +namespace syntax { +namespace pseudo { +namespace { + +using testing::AllOf; +using testing::ElementsAre; +using testing::IsEmpty; +using testing::UnorderedElementsAre; + +// Matches a Rule whose Target equals SID. +MATCHER_P(TargetID, SID, "") { return arg.Target == SID; } +// Matches a Rule whose seq() consists of exactly the given IDs, in order. +template testing::Matcher Sequence(T... 
IDs) { + return testing::Property(&Rule::seq, ElementsAre(IDs...)); +} + +class GrammarTest : public ::testing::Test { +public: + void build(llvm::StringRef BNF) { + Diags.clear(); + G = Grammar::parseBNF(BNF, Diags); + } + + SymbolID lookup(llvm::StringRef Name) const { + for (unsigned I = 0; I < NumTerminals; ++I) + if (G->table().Terminals[I] == Name) + return tokenSymbol(static_cast(I)); + for (SymbolID ID = 0; ID < G->table().Nonterminals.size(); ++ID) + if (G->table().Nonterminals[ID].Name == Name) + return ID; + ADD_FAILURE() << "No such symbol found: " << Name; + return 0; + } + +protected: + std::unique_ptr G; + std::vector Diags; +}; + +TEST_F(GrammarTest, Basic) { + build("expression := IDENTIFIER + expression # comment"); + EXPECT_THAT(Diags, IsEmpty()); + + auto ExpectedRule = + AllOf(TargetID(lookup("expression")), + Sequence(lookup("IDENTIFIER"), lookup("+"), lookup("expression"))); + auto ExpressionID = lookup("expression"); + EXPECT_EQ(G->symbolName(ExpressionID), "expression"); + EXPECT_THAT(G->rulesFor(ExpressionID), UnorderedElementsAre(ExpectedRule)); + const auto &Rule = G->lookupRule(/*RID=*/0); + EXPECT_THAT(Rule, ExpectedRule); + EXPECT_THAT(G->symbolName(Rule.seq()[0]), "IDENTIFIER"); + EXPECT_THAT(G->symbolName(Rule.seq()[1]), "+"); + EXPECT_THAT(G->symbolName(Rule.seq()[2]), "expression"); +} + +TEST_F(GrammarTest, EliminatedOptional) { + build("_ := CONST_opt INT ;_opt"); + EXPECT_THAT(Diags, IsEmpty()); + EXPECT_THAT(G->table().Rules, + UnorderedElementsAre( + Sequence(lookup("INT")), + Sequence(lookup("CONST"), lookup("INT")), + Sequence(lookup("CONST"), lookup("INT"), lookup(";")), + Sequence(lookup("INT"), lookup(";")))); +} + +TEST_F(GrammarTest, Diagnostics) { + build(R"cpp( + _ := ,_opt + _ := undefined-sym + null := + _ := IDENFIFIE # a typo of the terminal IDENFITIER + + invalid + )cpp"); + + EXPECT_THAT(Diags, UnorderedElementsAre( + "Rule '_ := ,_opt' has a nullable RHS", + "Rule 'null := ' has a nullable RHS", + "No rules 
for nonterminal: undefined-sym", + "Failed to parse 'invalid': no separator :=", + "Token-like name IDENFIFIE is used as a nonterminal", + "No rules for nonterminal: IDENFIFIE")); +} + +} // namespace +} // namespace pseudo +} // namespace syntax +} // namespace clang