add_subdirectory(clang-move)
add_subdirectory(clang-query)
add_subdirectory(pp-trace)
+add_subdirectory(pseudo)
add_subdirectory(tool-template)
# Add the common testsuite after all the tools.
--- /dev/null
+include_directories(include)
+include_directories(${CMAKE_CURRENT_BINARY_DIR}/include)
+add_subdirectory(lib)
+add_subdirectory(tool)
+add_subdirectory(unittests)
+add_subdirectory(test)
--- /dev/null
+# clang pseudoparser
+
+This directory implements an approximate heuristic parser for C++, based on the
+clang lexer, the C++ grammar, and the GLR parsing algorithm.
+
+It parses a file in isolation, without reading its included headers.
+The result is a strict syntactic tree whose structure follows the C++ grammar.
+There is no semantic analysis, apart from guesses to disambiguate the parse.
+Disambiguation can optionally be guided by an AST or a symbol index.
+
+For now, the best reference on intended scope is the [design proposal],
+with further discussion on the [RFC].
+
+## Dependencies between pseudoparser and clang
+
+Dependencies are limited, both because most of them wouldn't make sense and to
+avoid placing a burden on clang maintainers.
+
+The pseudoparser reuses the clang lexer (clangLex and clangBasic libraries) but
+not the higher-level libraries (Parse, Sema, AST, Frontend...).
+
+Where the pseudoparser is used together with an AST (e.g. to guide
+disambiguation), that code lives in a separate "bridge" library that depends on
+both.
+
+Clang does not depend on the pseudoparser at all. If this seems useful in the
+future, it should be discussed in an RFC.
+
+## Parity between pseudoparser and clang
+
+The pseudoparser aims to understand real-world code, and particularly the
+languages and extensions supported by Clang.
+
+However we don't try to keep these in lockstep: there's no expectation that
+Clang parser changes are accompanied by pseudoparser changes or vice versa.
+
+[design proposal]: https://docs.google.com/document/d/1eGkTOsFja63wsv8v0vd5JdoTonj-NlN3ujGF0T7xDbM/edit
+[RFC]: https://discourse.llvm.org/t/rfc-a-c-pseudo-parser-for-tooling/59217/49
--- /dev/null
+//===--- DirectiveMap.h - Find and strip preprocessor directives -*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The pseudoparser tries to match a token stream to the C++ grammar.
+// Preprocessor #defines and other directives are not part of this grammar, and
+// should be removed before the file can be parsed.
+//
+// Conditional blocks like #if...#else...#endif are particularly tricky, as
+// simply stripping the directives may not produce a grammatical result:
+//
+// return
+// #ifndef DEBUG
+// 1
+// #else
+// 0
+// #endif
+// ;
+//
+// This header supports analyzing and removing the directives in a source file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CLANG_PSEUDO_DIRECTIVEMAP_H
+#define CLANG_PSEUDO_DIRECTIVEMAP_H
+
+#include "clang-pseudo/Token.h"
+#include "clang/Basic/TokenKinds.h"
+#include <vector>
+
+namespace clang {
+class LangOptions;
+namespace pseudo {
+
+/// Describes the structure of a source file, as seen by the preprocessor.
+///
+/// The structure is a tree, whose leaves are plain source code and directives,
+/// and whose internal nodes are #if...#endif sections.
+///
+/// (root)
+/// |-+ Directive #include <stdio.h>
+/// |-+ Code int main() {
+/// | ` printf("hello, ");
+/// |-+ Conditional -+ Directive #ifndef NDEBUG
+/// | |-+ Code printf("debug\n");
+/// | |-+ Directive #else
+/// | |-+ Code printf("production\n");
+/// | `-+ Directive #endif
+/// |-+ Code return 0;
+/// ` }
+///
+/// Unlike the clang preprocessor, we model the full tree explicitly.
+/// This class does not recognize macro usage, only directives.
+struct DirectiveMap {
+ /// A range of code (and possibly comments) containing no directives.
+ struct Code {
+ Token::Range Tokens;
+ };
+ /// A preprocessor directive.
+ struct Directive {
+ /// Raw tokens making up the directive, starting with `#`.
+ Token::Range Tokens;
+ clang::tok::PPKeywordKind Kind = clang::tok::pp_not_keyword;
+ };
+ /// A preprocessor conditional section.
+ ///
+ /// This starts with an #if, #ifdef, #ifndef etc directive.
+ /// It covers all #else branches, and spans until the matching #endif.
+ struct Conditional {
+ /// The sequence of directives that introduce top-level alternative parses.
+ ///
+ /// The first branch will have an #if type directive.
+ /// Subsequent branches will have #else type directives.
+ std::vector<std::pair<Directive, DirectiveMap>> Branches;
+    /// The directive terminating the conditional (should be #endif).
+ Directive End;
+ };
+
+ /// Some piece of the file. {One of Code, Directive, Conditional}.
+ class Chunk; // Defined below.
+ std::vector<Chunk> Chunks;
+
+ /// Extract preprocessor structure by examining the raw tokens.
+ static DirectiveMap parse(const TokenStream &);
+
+  // FIXME: add heuristic selection of conditional branches.
+ // FIXME: allow deriving a preprocessed stream
+};
+llvm::raw_ostream &operator<<(llvm::raw_ostream &, const DirectiveMap &);
+llvm::raw_ostream &operator<<(llvm::raw_ostream &, const DirectiveMap::Chunk &);
+llvm::raw_ostream &operator<<(llvm::raw_ostream &, const DirectiveMap::Code &);
+llvm::raw_ostream &operator<<(llvm::raw_ostream &,
+ const DirectiveMap::Directive &);
+llvm::raw_ostream &operator<<(llvm::raw_ostream &,
+ const DirectiveMap::Conditional &);
+
+// FIXME: This approximates std::variant<Code, Directive, Conditional>.
+// Switch once we can use C++17.
+class DirectiveMap::Chunk {
+public:
+ enum Kind { K_Empty, K_Code, K_Directive, K_Conditional };
+ Kind kind() const {
+ return CodeVariant ? K_Code
+ : DirectiveVariant ? K_Directive
+ : ConditionalVariant ? K_Conditional
+ : K_Empty;
+ }
+
+ Chunk() = delete;
+ Chunk(const Chunk &) = delete;
+ Chunk(Chunk &&) = default;
+ Chunk &operator=(const Chunk &) = delete;
+ Chunk &operator=(Chunk &&) = default;
+ ~Chunk() = default;
+
+ // T => Chunk constructor.
+ Chunk(Code C) : CodeVariant(std::move(C)) {}
+ Chunk(Directive C) : DirectiveVariant(std::move(C)) {}
+ Chunk(Conditional C) : ConditionalVariant(std::move(C)) {}
+
+ // Chunk => T& and const T& conversions.
+#define CONVERSION(CONST, V) \
+ explicit operator CONST V &() CONST { return *V##Variant; }
+ CONVERSION(const, Code);
+ CONVERSION(, Code);
+ CONVERSION(const, Directive);
+ CONVERSION(, Directive);
+ CONVERSION(const, Conditional);
+ CONVERSION(, Conditional);
+#undef CONVERSION
+
+private:
+ // Wasteful, a union variant would be better!
+ llvm::Optional<Code> CodeVariant;
+ llvm::Optional<Directive> DirectiveVariant;
+ llvm::Optional<Conditional> ConditionalVariant;
+};
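+
+// Example usage (an illustrative sketch; `Code` and `LangOpts` are
+// hypothetical inputs):
+//   TokenStream Stream = lex(Code, LangOpts); // see clang-pseudo/Token.h
+//   DirectiveMap Map = DirectiveMap::parse(Stream);
+//   llvm::errs() << Map; // dumps the chunk tree described above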
+
+} // namespace pseudo
+} // namespace clang
+
+#endif // CLANG_PSEUDO_DIRECTIVEMAP_H
--- /dev/null
+//===--- Grammar.h - grammar used by clang pseudo parser --------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines base structures for parsing & modeling a grammar for a
+// programming language:
+//
+// # This is a fake C++ BNF grammar
+// _ := translation-unit
+// translation-unit := declaration-seq_opt
+// declaration-seq := declaration
+// declaration-seq := declaration-seq declaration
+//
+// A grammar formally describes a language, and it is constructed by a set of
+// production rules. A rule is of BNF form (AAA := BBB CCC). A symbol is either
+// non-terminal or terminal, identified by a SymbolID.
+//
+// Notions about the BNF grammar:
+// - "_" is the start symbol of the augmented grammar;
+// - single-line comment is supported, starting with a #
+// - A rule describes how a nonterminal (left side of :=) is constructed, and
+// it is *per line* in the grammar file
+// - Terminals (also called tokens) correspond to the clang::TokenKind; they
+// are written in the grammar like "IDENTIFIER", "USING", "+"
+//  - Nonterminals are specified with "lower-case" names in the grammar; they
+//    shouldn't be nullable (i.e. derive the empty sequence)
+// - optional symbols are supported (specified with a _opt suffix), and they
+// will be eliminated during the grammar parsing stage
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CLANG_PSEUDO_GRAMMAR_H
+#define CLANG_PSEUDO_GRAMMAR_H
+
+#include "clang/Basic/TokenKinds.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/StringRef.h"
+#include <cstdint>
+#include <vector>
+
+namespace clang {
+namespace pseudo {
+// A SymbolID uniquely identifies a terminal/non-terminal symbol in a grammar.
+// Non-terminal IDs are indexes into a table of non-terminal symbols.
+// Terminal IDs correspond to the clang TokenKind enum.
+using SymbolID = uint16_t;
+// SymbolID is only 12 bits wide.
+// There are at most 2^11 terminals (aka tokens) and 2^11 nonterminals.
+static constexpr uint16_t SymbolBits = 12;
+static constexpr uint16_t NumTerminals = tok::NUM_TOKENS;
+// SymbolIDs with the top bit set are tokens/terminals.
+static constexpr SymbolID TokenFlag = 1 << (SymbolBits - 1);
+inline bool isToken(SymbolID ID) { return ID & TokenFlag; }
+inline bool isNonterminal(SymbolID ID) { return !isToken(ID); }
+// The terminals are always the clang tok::TokenKind (not all are used).
+inline tok::TokenKind symbolToToken(SymbolID SID) {
+ assert(isToken(SID));
+ SID &= ~TokenFlag;
+ assert(SID < NumTerminals);
+ return static_cast<tok::TokenKind>(SID);
+}
+inline SymbolID tokenSymbol(tok::TokenKind TK) {
+ return TokenFlag | static_cast<SymbolID>(TK);
+}
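+// For example (sketch), the terminal encoding round-trips:
+//   SymbolID S = tokenSymbol(tok::semi); // TokenFlag | tok::semi
+//   assert(isToken(S) && symbolToToken(S) == tok::semi);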
+
+// A RuleID uniquely identifies a production rule in a grammar.
+// It is an index into a table of rules.
+using RuleID = uint16_t;
+// There are maximum 2^12 rules.
+static constexpr unsigned RuleBits = 12;
+
+// Represent a production rule in the grammar, e.g.
+// expression := a b c
+// ^Target ^Sequence
+struct Rule {
+ Rule(SymbolID Target, llvm::ArrayRef<SymbolID> Seq);
+
+  // We use 4 bits to store the length of the sequence, so in theory it could
+  // be up to 15 symbols long. We are stricter in order to reduce the size: we
+  // limit the max length to 9 (the longest sequence in the C++ grammar).
+ static constexpr unsigned SizeBits = 4;
+ static constexpr unsigned MaxElements = 9;
+  static_assert(MaxElements < (1 << SizeBits), "Exceeds the maximum limit");
+ static_assert(SizeBits + SymbolBits <= 16,
+ "Must be able to store symbol ID + size efficiently");
+
+ // 16 bits for target symbol and size of sequence:
+ // SymbolID : 12 | Size : 4
+ SymbolID Target : SymbolBits;
+ uint8_t Size : SizeBits; // Size of the Sequence
+ SymbolID Sequence[MaxElements];
+
+ llvm::ArrayRef<SymbolID> seq() const {
+ return llvm::ArrayRef<SymbolID>(Sequence, Size);
+ }
+ friend bool operator==(const Rule &L, const Rule &R) {
+ return L.Target == R.Target && L.seq() == R.seq();
+ }
+};
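+
+// For example (sketch), a hypothetical rule `expr := expr - expr` is stored
+// with Target = <SymbolID of expr>, Size = 3, and
+// seq() == {expr, tokenSymbol(tok::minus), expr}.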
+
+struct GrammarTable;
+
+// Grammar that describes a programming language, e.g. C++. It represents the
+// contents of the specified grammar.
+// It is a building block for constructing a table-based parser.
+class Grammar {
+public:
+ explicit Grammar(std::unique_ptr<GrammarTable>);
+
+ // Parses grammar from a BNF file.
+ // Diagnostics emitted during parsing are stored in Diags.
+ static std::unique_ptr<Grammar> parseBNF(llvm::StringRef BNF,
+ std::vector<std::string> &Diags);
+
+ // Returns the SymbolID of the start symbol '_'.
+  SymbolID startSymbol() const { return StartSymbol; }
+
+ // Returns all rules of the given non-terminal symbol.
+ llvm::ArrayRef<Rule> rulesFor(SymbolID SID) const;
+ const Rule &lookupRule(RuleID RID) const;
+
+  // Gets the name of a symbol (terminal or non-terminal).
+  // Terminals have names like "," (comma) or "OPERATOR" (kw_operator).
+ llvm::StringRef symbolName(SymbolID) const;
+
+ // Dumps the whole grammar.
+ std::string dump() const;
+ // Dumps a particular rule.
+ std::string dumpRule(RuleID) const;
+ // Dumps all rules of the given nonterminal symbol.
+ std::string dumpRules(SymbolID) const;
+
+ const GrammarTable &table() const { return *T; }
+
+private:
+ std::unique_ptr<GrammarTable> T;
+ // The start symbol '_' of the augmented grammar.
+ SymbolID StartSymbol;
+};
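+
+// Example (sketch, using a hypothetical two-rule grammar):
+//   std::vector<std::string> Diags;
+//   auto G = Grammar::parseBNF("_ := expr\nexpr := IDENTIFIER", Diags);
+//   assert(Diags.empty());
+//   llvm::errs() << G->dump();
+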
+// For each nonterminal X, computes the set of terminals that begin strings
+// derived from X. (Known as FIRST sets in grammar-based parsers).
+std::vector<llvm::DenseSet<SymbolID>> firstSets(const Grammar &);
+// For each nonterminal X, computes the set of terminals that could immediately
+// follow X. (Known as FOLLOW sets in grammar-based parsers).
+std::vector<llvm::DenseSet<SymbolID>> followSets(const Grammar &);
+
+// Storage for the underlying data of the Grammar.
+// It can be constructed dynamically (from compiling BNF file) or statically
+// (a compiled data-source).
+struct GrammarTable {
+ GrammarTable();
+
+ struct Nonterminal {
+ std::string Name;
+    // The rules that produce this nonterminal: a half-open [start, end)
+    // index range into the Rules table.
+ struct {
+ RuleID start;
+ RuleID end;
+ } RuleRange;
+ };
+
+ // The rules are sorted (and thus grouped) by target symbol.
+ // RuleID is the index of the vector.
+ std::vector<Rule> Rules;
+  // A table of terminal (aka token) names.
+  // clang::tok::TokenKind is the index of the table.
+ llvm::ArrayRef<std::string> Terminals;
+ // A table of nonterminals, sorted by name.
+ // SymbolID is the index of the table.
+ std::vector<Nonterminal> Nonterminals;
+};
+
+} // namespace pseudo
+} // namespace clang
+
+#endif // CLANG_PSEUDO_GRAMMAR_H
--- /dev/null
+//===--- LRGraph.h - Build an LR automaton ------------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// LR parsers are bottom-up parsers -- they scan the input from left to right,
+// and collect the right-hand side of a production rule (called handle) on top
+// of the stack, then replace (reduce) the handle with the nonterminal defined
+// by the production rule.
+//
+// This file defines LRGraph, a deterministic handle-finding finite-state
+// automaton, which is a key component in LR parsers to recognize any of
+// handles in the grammar efficiently. We build the LR table (ACTION and GOTO
+// Table) based on the LRGraph.
+//
+// LRGraph can be constructed for any context-free grammars.
+// Even for an LR-ambiguous grammar, we can construct a deterministic FSA, but
+// interpretation of the FSA is nondeterministic -- we might be in a state
+// where we can both continue searching for a handle and identify one (a
+// shift/reduce conflict), or identify more than one handle (a reduce/reduce
+// conflict).
+//
+// LRGraph is a common model for all variants of LR automata, from the most
+// basic LR(0) to the more powerful SLR(1) and LR(1), which use a one-token
+// lookahead in making decisions.
+//===----------------------------------------------------------------------===//
+
+#ifndef CLANG_PSEUDO_LRGRAPH_H
+#define CLANG_PSEUDO_LRGRAPH_H
+
+#include "clang-pseudo/Grammar.h"
+#include "llvm/ADT/Hashing.h"
+#include <vector>
+
+namespace clang {
+namespace pseudo {
+
+// An LR item -- a grammar rule with a dot at some position of the body.
+// e.g. a production rule A := X Y yields 3 items:
+// A := . X Y
+// A := X . Y
+// A := X Y .
+// An item indicates how much of a production rule has been recognized at a
+// position (described by dot), for example, A := X . Y indicates that we have
+// recognized the X part from the input, and we hope next to see the input
+// derivable from Y.
+class Item {
+public:
+ static Item start(RuleID ID, const Grammar &G) {
+ Item I;
+ I.RID = ID;
+ I.RuleLength = G.lookupRule(ID).Size;
+ return I;
+ }
+ static Item sentinel(RuleID ID) {
+ Item I;
+ I.RID = ID;
+ return I;
+ }
+
+ RuleID rule() const { return RID; }
+ uint8_t dot() const { return DotPos; }
+
+ bool hasNext() const { return DotPos < RuleLength; }
+ SymbolID next(const Grammar &G) const {
+ assert(hasNext());
+ return G.lookupRule(RID).Sequence[DotPos];
+ }
+
+ Item advance() const {
+ assert(hasNext());
+ Item I = *this;
+ ++I.DotPos;
+ return I;
+ }
+
+ std::string dump(const Grammar &G) const;
+
+ bool operator==(const Item &I) const {
+ return DotPos == I.DotPos && RID == I.RID;
+ }
+ bool operator<(const Item &I) const {
+ return std::tie(RID, DotPos) < std::tie(I.RID, I.DotPos);
+ }
+ friend llvm::hash_code hash_value(const Item &I) {
+ return llvm::hash_combine(I.RID, I.DotPos);
+ }
+
+private:
+ RuleID RID = 0;
+ uint8_t DotPos = 0;
+ uint8_t RuleLength = 0; // the length of rule body.
+};
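+
+// Example (sketch), for a rule A := X Y with ID RID:
+//   Item I = Item::start(RID, G); // A := . X Y ; I.next(G) is X
+//   I = I.advance();              // A := X . Y ; I.next(G) is Y
+//   I = I.advance();              // A := X Y . ; I.hasNext() is false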
+
+// A state represents a node in the LR automaton graph. It is an item set, which
+// contains all possible rules that the LR parser may be parsing in that state.
+//
+// Conceptually, if we knew in advance what we're parsing, at any point we're
+// partway through parsing a production, sitting on a stack of partially parsed
+// productions. But because we don't know, there could be *several* productions
+// we're partway through. The set of possibilities is the parser state, and we
+// precompute all the transitions between these states.
+struct State {
+ // A full set of items (including non-kernel items) representing the state,
+ // in a canonical order (see SortByNextSymbol in the cpp file).
+ std::vector<Item> Items;
+
+ std::string dump(const Grammar &G, unsigned Indent = 0) const;
+};
+
+// LRGraph is a deterministic finite state automaton for LR parsing.
+//
+// Intuitively, an LR automaton is a transition graph. The graph has a
+// collection of nodes, called States. Each state corresponds to a particular
+// item set, which represents a condition that could occur during the process of
+// parsing a production. Edges are directed from one state to another. Each edge
+// is labeled by a grammar symbol (terminal or nonterminal).
+//
+// LRGraph is used to construct the LR parsing table which is a core
+// data-structure driving the LR parser.
+class LRGraph {
+public:
+ // StateID is the index in States table.
+ using StateID = uint16_t;
+
+ // Constructs an LR(0) automaton.
+ static LRGraph buildLR0(const Grammar &);
+
+ // An edge in the LR graph, it represents a transition in the LR automaton.
+ // If the parser is at state Src, with a lookahead Label, then it
+ // transits to state Dst.
+ struct Edge {
+ StateID Src, Dst;
+ SymbolID Label;
+ };
+
+ llvm::ArrayRef<State> states() const { return States; }
+ llvm::ArrayRef<Edge> edges() const { return Edges; }
+
+ std::string dumpForTests(const Grammar &) const;
+
+private:
+ LRGraph(std::vector<State> States, std::vector<Edge> Edges)
+ : States(std::move(States)), Edges(std::move(Edges)) {}
+
+ std::vector<State> States;
+ std::vector<Edge> Edges;
+};
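+
+// Example (sketch; `G` is a Grammar built elsewhere):
+//   LRGraph Graph = LRGraph::buildLR0(G);
+//   for (const LRGraph::Edge &E : Graph.edges()) {
+//     // E.Src --[E.Label]--> E.Dst
+//   }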
+
+} // namespace pseudo
+} // namespace clang
+
+namespace llvm {
+// Support clang::pseudo::Item as DenseMap keys.
+template <> struct DenseMapInfo<clang::pseudo::Item> {
+ static inline clang::pseudo::Item getEmptyKey() {
+ return clang::pseudo::Item::sentinel(-1);
+ }
+ static inline clang::pseudo::Item getTombstoneKey() {
+ return clang::pseudo::Item::sentinel(-2);
+ }
+ static unsigned getHashValue(const clang::pseudo::Item &I) {
+ return hash_value(I);
+ }
+ static bool isEqual(const clang::pseudo::Item &LHS,
+ const clang::pseudo::Item &RHS) {
+ return LHS == RHS;
+ }
+};
+} // namespace llvm
+
+#endif // CLANG_PSEUDO_LRGRAPH_H
--- /dev/null
+//===--- LRTable.h - Define LR Parsing Table ---------------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The LRTable (referred to as the LR parsing table in the literature) is the
+// core component of an LR parser: it drives the parser by specifying the
+// action to take given the current state on the top of the stack and the
+// current lookahead token.
+//
+// The LRTable can be described as a matrix where the rows represent
+// the states of the LR graph, the columns represent the symbols of the
+// grammar, and each entry of the matrix (called action) represents a
+// state transition in the graph.
+//
+// Typically, based on the category of the grammar symbol, the LRTable is
+// broken into two logically separate tables:
+//   - ACTION table with terminals as columns -- e.g. ACTION[S, a] specifies
+//     the next action (shift/reduce/accept/error) in state S under a
+//     lookahead terminal a
+//   - GOTO table with nonterminals as columns -- e.g. GOTO[S, X] specifies
+//     the state we transition to from state S on the nonterminal X
+//
+// LRTable is *performance-critical*, as it is consulted frequently during a
+// parse. In general, an LRTable is very sparse (most of the entries are
+// empty). For example, for the C++ language, the SLR table has ~1500 states
+// and 650 symbols, i.e. a matrix of 975K entries, ~90% of which are empty.
+//
+// This file implements a speed-and-space-efficient LRTable.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CLANG_PSEUDO_LRTABLE_H
+#define CLANG_PSEUDO_LRTABLE_H
+
+#include "clang-pseudo/Grammar.h"
+#include "llvm/ADT/ArrayRef.h"
+#include <cstdint>
+#include <vector>
+
+namespace clang {
+namespace pseudo {
+
+// Represents the LR parsing table, which can efficiently answer the question
+// "what is the next step, given the lookahead token and the current state on
+// top of the stack?".
+//
+// This is a compact implementation, which only takes an amount of space that
+// is proportional to the number of non-empty entries in the table.
+//
+// Unlike the typical LR parsing table which allows at most one available action
+// per entry, conflicted actions are allowed in LRTable. The LRTable is designed
+// to be used in nondeterministic LR parsers (e.g. GLR).
+class LRTable {
+public:
+ // StateID is only 13 bits wide.
+ using StateID = uint16_t;
+ static constexpr unsigned StateBits = 13;
+
+ // Action represents the terminal and nonterminal actions, it combines the
+ // entry of the ACTION and GOTO tables from the LR literature.
+ class Action {
+ public:
+ enum Kind : uint8_t {
+ Sentinel = 0,
+ // Terminal actions, corresponding to entries of ACTION table.
+
+ // Shift to state n: move forward with the lookahead, and push state n
+ // onto the state stack.
+ // A shift is a forward transition, and the value n is the next state that
+ // the parser is to enter.
+ Shift,
+ // Reduce by a rule: pop the state stack.
+ Reduce,
+ // Signals that we have parsed the input successfully.
+ Accept,
+
+ // Nonterminal actions, corresponding to entry of GOTO table.
+
+ // Go to state n: push state n onto the state stack.
+ // Similar to Shift, but it is a nonterminal forward transition.
+ GoTo,
+ };
+
+ static Action accept(RuleID RID) { return Action(Accept, RID); }
+ static Action goTo(StateID S) { return Action(GoTo, S); }
+ static Action shift(StateID S) { return Action(Shift, S); }
+ static Action reduce(RuleID RID) { return Action(Reduce, RID); }
+ static Action sentinel() { return Action(Sentinel, 0); }
+
+ StateID getShiftState() const {
+ assert(kind() == Shift);
+ return Value;
+ }
+ StateID getGoToState() const {
+ assert(kind() == GoTo);
+ return Value;
+ }
+ RuleID getReduceRule() const {
+ assert(kind() == Reduce);
+ return Value;
+ }
+ Kind kind() const { return static_cast<Kind>(K); }
+
+ bool operator==(const Action &L) const { return opaque() == L.opaque(); }
+    uint16_t opaque() const { return K << ValueBits | Value; }
+
+ private:
+ Action(Kind K1, unsigned Value) : K(K1), Value(Value) {}
+ static constexpr unsigned ValueBits = StateBits;
+ static constexpr unsigned KindBits = 3;
+ static_assert(ValueBits >= RuleBits, "Value must be able to store RuleID");
+ static_assert(KindBits + ValueBits <= 16,
+ "Must be able to store kind and value efficiently");
+ uint16_t K : KindBits;
+ // Either StateID or RuleID, depending on the Kind.
+ uint16_t Value : ValueBits;
+ };
+
+ // Returns all available actions for the given state on a terminal.
+ // Expected to be called by LR parsers.
+ llvm::ArrayRef<Action> getActions(StateID State, SymbolID Terminal) const;
+ // Returns the state after we reduce a nonterminal.
+ // Expected to be called by LR parsers.
+ StateID getGoToState(StateID State, SymbolID Nonterminal) const;
+
+ // Looks up available actions.
+ // Returns empty if no available actions in the table.
+ llvm::ArrayRef<Action> find(StateID State, SymbolID Symbol) const;
+
+ size_t bytes() const {
+ return sizeof(*this) + Actions.capacity() * sizeof(Action) +
+ States.capacity() * sizeof(StateID) +
+ NontermOffset.capacity() * sizeof(uint32_t) +
+ TerminalOffset.capacity() * sizeof(uint32_t);
+ }
+
+ std::string dumpStatistics() const;
+ std::string dumpForTests(const Grammar &G) const;
+
+ // Build a SLR(1) parsing table.
+ static LRTable buildSLR(const Grammar &G);
+
+ class Builder;
+ // Represents an entry in the table, used for building the LRTable.
+ struct Entry {
+ StateID State;
+ SymbolID Symbol;
+ Action Act;
+ };
+  // Builds a table from the given entries, for testing purposes.
+ static LRTable buildForTests(const GrammarTable &, llvm::ArrayRef<Entry>);
+
+private:
+ // Conceptually the LR table is a multimap from (State, SymbolID) => Action.
+ // Our physical representation is quite different for compactness.
+
+  // Index is a nonterminal SymbolID; the value is the offset into
+  // States/Actions where the entries for this nonterminal begin.
+  // Given a nonterminal id, the corresponding half-open range of States is
+  // [NontermOffset[id], NontermOffset[id+1]).
+ std::vector<uint32_t> NontermOffset;
+ // Similar to NontermOffset, but for terminals, index is tok::TokenKind.
+ std::vector<uint32_t> TerminalOffset;
+  // Parallel to Actions; the value is the source State (a row of the matrix).
+  // Grouped by SymbolID; only the per-symbol subranges are sorted.
+ std::vector<StateID> States;
+ // A flat list of available actions, sorted by (SymbolID, State).
+ std::vector<Action> Actions;
+};
+llvm::raw_ostream &operator<<(llvm::raw_ostream &, const LRTable::Action &);
+
+} // namespace pseudo
+} // namespace clang
+
+#endif // CLANG_PSEUDO_LRTABLE_H
--- /dev/null
+//===--- Token.h - Tokens and token streams in the pseudoparser --*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Tokens are the first level of abstraction above bytes used in pseudoparsing.
+// We use clang's lexer to scan the bytes (in raw mode, with no preprocessor).
+// Each token is wrapped into a pseudo::Token, along with line/indent info.
+//
+// Unlike clang, we make multiple passes over the whole file, out-of-order.
+// Therefore we retain the whole token sequence in memory. (This is feasible as
+// we process one file at a time). pseudo::TokenStream holds such a stream.
+// The initial stream holds the raw tokens read from the file, later passes
+// operate on derived TokenStreams (e.g. with directives stripped).
+//
+// Similar facilities from clang that are *not* used:
+// - SourceManager: designed around multiple files and precise macro expansion.
+// - clang::Token: coupled to SourceManager, doesn't retain layout info.
+// (pseudo::Token is similar, but without SourceLocations).
+// - syntax::TokenBuffer: coupled to SourceManager, has #includes and macros.
+// (pseudo::TokenStream is similar, but a flat token list).
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CLANG_PSEUDO_TOKEN_H
+#define CLANG_PSEUDO_TOKEN_H
+
+#include "clang/Basic/LLVM.h"
+#include "clang/Basic/TokenKinds.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <vector>
+
+namespace clang {
+class LangOptions;
+namespace pseudo {
+
+/// A single C++ or preprocessor token.
+///
+/// Unlike clang::Token and syntax::Token, these tokens are not connected to a
+/// SourceManager - we are not dealing with multiple files.
+struct Token {
+ /// An Index identifies a token within a stream.
+ using Index = uint32_t;
+ /// A sentinel Index indicating no token.
+ constexpr static Index Invalid = std::numeric_limits<Index>::max();
+ struct Range;
+
+ /// The token text.
+ ///
+ /// Typically from the original source file, but may have been synthesized.
+ StringRef text() const { return StringRef(Data, Length); }
+ const char *Data = nullptr;
+ uint32_t Length = 0;
+
+ /// Zero-based line number for the start of the token.
+ /// This refers to the original source file as written.
+ uint32_t Line = 0;
+ /// Width of whitespace before the first token on this line.
+ uint8_t Indent = 0;
+ /// Flags have some meaning defined by the function that produced this stream.
+ uint8_t Flags = 0;
+ // Helpers to get/set Flags based on `enum class`.
+ template <class T> bool flag(T Mask) const {
+ return Flags & uint8_t{static_cast<std::underlying_type_t<T>>(Mask)};
+ }
+ template <class T> void setFlag(T Mask) {
+ Flags |= uint8_t{static_cast<std::underlying_type_t<T>>(Mask)};
+ }
+
+ /// The type of token as determined by clang's lexer.
+ clang::tok::TokenKind Kind = clang::tok::unknown;
+};
+static_assert(sizeof(Token) <= sizeof(char *) + 16, "Careful with layout!");
+llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Token &);
+
+/// A half-open range of tokens within a stream.
+struct Token::Range {
+ Index Begin = 0;
+ Index End = 0;
+
+ uint32_t size() const { return End - Begin; }
+ static Range emptyAt(Index Index) { return Range{Index, Index}; }
+};
+llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Token::Range &);
+
+/// A complete sequence of Tokens representing a source file.
+///
+/// This may match a raw file from disk, or be derived from a previous stream.
+/// For example, stripping comments from a TokenStream results in a new stream.
+///
+/// A stream has sentinel 'eof' tokens at each end, e.g. `int main();` becomes:
+/// int main ( ) ;
+/// eof kw_int ident l_paren r_paren semi eof
+/// front() back()
+/// 0 1 2 3 4 5
+class TokenStream {
+public:
+ /// Create an empty stream.
+ ///
+ /// Initially, the stream is appendable and not finalized.
+ /// The token sequence may only be accessed after finalize() is called.
+ ///
+ /// Payload is an opaque object which will be owned by the stream.
+ /// e.g. an allocator to hold backing storage for synthesized token text.
+ explicit TokenStream(std::shared_ptr<void> Payload = nullptr);
+
+ /// Append a token to the stream, which must not be finalized.
+ void push(Token T) {
+ assert(!isFinalized());
+ Storage.push_back(std::move(T));
+ }
+
+ /// Finalize the token stream, allowing tokens to be accessed.
+ /// Tokens may no longer be appended.
+ void finalize();
+ bool isFinalized() const;
+
+ /// Returns the index of T within the stream.
+ ///
+ /// T must be within the stream or the end sentinel (not the start sentinel).
+ Token::Index index(const Token &T) const {
+ assert(isFinalized());
+ assert(&T >= Storage.data() && &T < Storage.data() + Storage.size());
+ assert(&T != Storage.data() && "start sentinel");
+ return &T - Tokens.data();
+ }
+
+ ArrayRef<Token> tokens() const {
+ assert(isFinalized());
+ return Tokens;
+ }
+ ArrayRef<Token> tokens(Token::Range R) const {
+ return tokens().slice(R.Begin, R.End - R.Begin);
+ }
+
+ /// May return the end sentinel if the stream is empty.
+ const Token &front() const {
+ assert(isFinalized());
+ return Storage[1];
+ }
+
+ /// Print the tokens in this stream to the output stream.
+ ///
+ /// The presence of newlines/spaces is preserved, but not the quantity.
+ void print(llvm::raw_ostream &) const;
+
+private:
+ std::shared_ptr<void> Payload;
+
+ MutableArrayRef<Token> Tokens;
+ std::vector<Token> Storage; // eof + Tokens + eof
+};
+llvm::raw_ostream &operator<<(llvm::raw_ostream &, const TokenStream &);
+
+/// Extracts a raw token stream from the source code.
+///
+/// All tokens will reference the data of the provided string.
+/// "word-like" tokens such as identifiers and keywords will be raw_identifier.
+TokenStream lex(const std::string &, const clang::LangOptions &);
+enum class LexFlags : uint8_t {
+ /// Marks the token at the start of a logical preprocessor line.
+ /// This is a position where a directive might start.
+ ///
+  /// Here, the first # is StartsPPLine, but the second is not (it is on the
+  /// same logical line).
+ /// #define X(error) \
+ /// #error // not a directive!
+ ///
+ /// Careful, the directive may not start exactly on the StartsPPLine token:
+ /// /*comment*/ #include <foo.h>
+ StartsPPLine = 1 << 0,
+ /// Marks tokens containing trigraphs, escaped newlines, UCNs etc.
+  /// The text() of such tokens will contain the raw trigraph.
+ NeedsCleaning = 1 << 1,
+};
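+
+// Example (sketch): DirectiveMap uses these flags to find candidate directive
+// starts, roughly:
+//   if (Tok.flag(LexFlags::StartsPPLine) && Tok.Kind == tok::hash)
+//     ; // a directive may begin here (modulo leading comments, see above)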
+
+/// Derives a token stream by decoding escapes and interpreting raw_identifiers.
+///
+/// Tokens containing UCNs, escaped newlines, trigraphs etc are decoded and
+/// their backing data is owned by the returned stream.
+/// raw_identifier tokens are assigned specific types (identifier, keyword etc).
+///
+/// The StartsPPLine flag is preserved.
+///
+/// Formally, identifier cooking happens here before preprocessing, whereas we
+/// should only cook raw_identifiers that survive preprocessing.
+/// However, ignoring the Token::Kind of tokens in directives achieves the same.
+/// (And having cooked token kinds in PP-disabled sections is useful for us).
+TokenStream cook(const TokenStream &, const clang::LangOptions &);
+
+/// Drops comment tokens.
+TokenStream stripComments(const TokenStream &);
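+
+// A typical pipeline (sketch; `Source` and `LangOpts` are hypothetical):
+//   TokenStream Raw = lex(Source, LangOpts);   // raw_identifiers, escapes kept
+//   TokenStream Cooked = cook(Raw, LangOpts);  // keywords/identifiers resolved
+//   TokenStream Clean = stripComments(Cooked); // ready for the parser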
+
+} // namespace pseudo
+} // namespace clang
+
+#endif // CLANG_PSEUDO_TOKEN_H
--- /dev/null
+set(LLVM_LINK_COMPONENTS Support)
+
+add_clang_library(clangPseudo
+ DirectiveMap.cpp
+ Grammar.cpp
+ GrammarBNF.cpp
+ Lex.cpp
+ LRGraph.cpp
+ LRTable.cpp
+ LRTableBuild.cpp
+ Token.cpp
+
+ LINK_LIBS
+ clangBasic
+ clangLex
+ )
--- /dev/null
+//===--- DirectiveMap.cpp - Find and strip preprocessor directives --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang-pseudo/DirectiveMap.h"
+#include "clang/Basic/IdentifierTable.h"
+#include "clang/Basic/TokenKinds.h"
+#include "llvm/Support/FormatVariadic.h"
+
+namespace clang {
+namespace pseudo {
+namespace {
+
+class DirectiveParser {
+public:
+ explicit DirectiveParser(const TokenStream &Code)
+ : Code(Code), Tok(&Code.front()) {}
+ void parse(DirectiveMap *Result) { parse(Result, /*TopLevel=*/true); }
+
+private:
+ // Roles that a directive might take within a conditional block.
+ enum class Cond { None, If, Else, End };
+ static Cond classifyDirective(tok::PPKeywordKind K) {
+ switch (K) {
+ case clang::tok::pp_if:
+ case clang::tok::pp_ifdef:
+ case clang::tok::pp_ifndef:
+ return Cond::If;
+ case clang::tok::pp_elif:
+ case clang::tok::pp_elifdef:
+ case clang::tok::pp_elifndef:
+ case clang::tok::pp_else:
+ return Cond::Else;
+ case clang::tok::pp_endif:
+ return Cond::End;
+ default:
+ return Cond::None;
+ }
+ }
+
+ // Parses tokens starting at Tok into Map.
+ // If we reach an End or Else directive that ends Map, returns it.
+ // If TopLevel is true, then we do not expect End and always return None.
+ llvm::Optional<DirectiveMap::Directive> parse(DirectiveMap *Map,
+ bool TopLevel) {
+ auto StartsDirective =
+ [&, AllowDirectiveAt((const Token *)nullptr)]() mutable {
+ if (Tok->flag(LexFlags::StartsPPLine)) {
+          // A comment at the start of a PP-line doesn't itself start a
+          // directive, but a directive may still start right after it.
+ if (Tok->Kind == tok::comment)
+ AllowDirectiveAt = Tok + 1;
+ return Tok->Kind == tok::hash;
+ }
+ return Tok->Kind == tok::hash && AllowDirectiveAt == Tok;
+ };
+ // Each iteration adds one chunk (or returns, if we see #endif).
+ while (Tok->Kind != tok::eof) {
+ // If there's no directive here, we have a code chunk.
+ if (!StartsDirective()) {
+ const Token *Start = Tok;
+ do
+ ++Tok;
+ while (Tok->Kind != tok::eof && !StartsDirective());
+ Map->Chunks.push_back(DirectiveMap::Code{
+ Token::Range{Code.index(*Start), Code.index(*Tok)}});
+ continue;
+ }
+
+ // We have some kind of directive.
+ DirectiveMap::Directive Directive;
+ parseDirective(&Directive);
+ Cond Kind = classifyDirective(Directive.Kind);
+ if (Kind == Cond::If) {
+ // #if or similar, starting a nested conditional block.
+ DirectiveMap::Conditional Conditional;
+ Conditional.Branches.emplace_back();
+ Conditional.Branches.back().first = std::move(Directive);
+ parseConditional(&Conditional);
+ Map->Chunks.push_back(std::move(Conditional));
+ } else if ((Kind == Cond::Else || Kind == Cond::End) && !TopLevel) {
+        // #endif or similar, ending this DirectiveMap scope.
+ // (#endif is unexpected at the top level, treat as simple directive).
+ return std::move(Directive);
+ } else {
+ // #define or similar, a simple directive at the current scope.
+ Map->Chunks.push_back(std::move(Directive));
+ }
+ }
+ return None;
+ }
+
+ // Parse the rest of a conditional section, after seeing the If directive.
+ // Returns after consuming the End directive.
+ void parseConditional(DirectiveMap::Conditional *C) {
+ assert(C->Branches.size() == 1 &&
+ C->Branches.front().second.Chunks.empty() &&
+ "Should be ready to parse first branch body");
+ while (Tok->Kind != tok::eof) {
+ auto Terminator = parse(&C->Branches.back().second, /*TopLevel=*/false);
+ if (!Terminator) {
+ assert(Tok->Kind == tok::eof && "gave up parsing before eof?");
+ C->End.Tokens = Token::Range::emptyAt(Code.index(*Tok));
+ return;
+ }
+ if (classifyDirective(Terminator->Kind) == Cond::End) {
+ C->End = std::move(*Terminator);
+ return;
+ }
+ assert(classifyDirective(Terminator->Kind) == Cond::Else &&
+ "ended branch unexpectedly");
+ C->Branches.emplace_back();
+ C->Branches.back().first = std::move(*Terminator);
+ }
+ }
+
+ // Parse a directive. Tok is the hash.
+ void parseDirective(DirectiveMap::Directive *D) {
+ assert(Tok->Kind == tok::hash);
+
+ // Directive spans from the hash until the end of line or file.
+ const Token *Begin = Tok++;
+ while (Tok->Kind != tok::eof && !Tok->flag(LexFlags::StartsPPLine))
+ ++Tok;
+ ArrayRef<Token> Tokens{Begin, Tok};
+ D->Tokens = {Code.index(*Tokens.begin()), Code.index(*Tokens.end())};
+
+ // Directive name is the first non-comment token after the hash.
+ Tokens = Tokens.drop_front().drop_while(
+ [](const Token &T) { return T.Kind == tok::comment; });
+ if (!Tokens.empty())
+ D->Kind = PPKeywords.get(Tokens.front().text()).getPPKeywordID();
+ }
+
+ const TokenStream &Code;
+ const Token *Tok;
+ clang::IdentifierTable PPKeywords;
+};
+
+} // namespace
+
+DirectiveMap DirectiveMap::parse(const TokenStream &Code) {
+ DirectiveMap Result;
+ DirectiveParser(Code).parse(&Result);
+ return Result;
+}
+
+static void dump(llvm::raw_ostream &OS, const DirectiveMap &, unsigned Indent);
+static void dump(llvm::raw_ostream &OS,
+ const DirectiveMap::Directive &Directive, unsigned Indent) {
+ OS.indent(Indent) << llvm::formatv("#{0} ({1} tokens)\n",
+ tok::getPPKeywordSpelling(Directive.Kind),
+ Directive.Tokens.size());
+}
+static void dump(llvm::raw_ostream &OS, const DirectiveMap::Code &Code,
+ unsigned Indent) {
+ OS.indent(Indent) << llvm::formatv("code ({0} tokens)\n", Code.Tokens.size());
+}
+static void dump(llvm::raw_ostream &OS,
+ const DirectiveMap::Conditional &Conditional,
+ unsigned Indent) {
+ for (const auto &Branch : Conditional.Branches) {
+ dump(OS, Branch.first, Indent);
+ dump(OS, Branch.second, Indent + 2);
+ }
+ dump(OS, Conditional.End, Indent);
+}
+
+static void dump(llvm::raw_ostream &OS, const DirectiveMap::Chunk &Chunk,
+ unsigned Indent) {
+ switch (Chunk.kind()) {
+ case DirectiveMap::Chunk::K_Empty:
+ llvm_unreachable("invalid chunk");
+ case DirectiveMap::Chunk::K_Code:
+ return dump(OS, (const DirectiveMap::Code &)Chunk, Indent);
+ case DirectiveMap::Chunk::K_Directive:
+ return dump(OS, (const DirectiveMap::Directive &)Chunk, Indent);
+ case DirectiveMap::Chunk::K_Conditional:
+ return dump(OS, (const DirectiveMap::Conditional &)Chunk, Indent);
+ }
+}
+
+static void dump(llvm::raw_ostream &OS, const DirectiveMap &Map,
+ unsigned Indent) {
+ for (const auto &Chunk : Map.Chunks)
+ dump(OS, Chunk, Indent);
+}
+
+// Define operator<< in terms of dump() functions above.
+#define OSTREAM_DUMP(Type) \
+ llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Type &T) { \
+ dump(OS, T, 0); \
+ return OS; \
+ }
+OSTREAM_DUMP(DirectiveMap)
+OSTREAM_DUMP(DirectiveMap::Chunk)
+OSTREAM_DUMP(DirectiveMap::Directive)
+OSTREAM_DUMP(DirectiveMap::Conditional)
+OSTREAM_DUMP(DirectiveMap::Code)
+#undef OSTREAM_DUMP
+
+} // namespace pseudo
+} // namespace clang
--- /dev/null
+//===--- Grammar.cpp - Grammar for clang pseudo parser ----------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang-pseudo/Grammar.h"
+#include "clang/Basic/TokenKinds.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace clang {
+namespace pseudo {
+
+Rule::Rule(SymbolID Target, llvm::ArrayRef<SymbolID> Sequence)
+ : Target(Target), Size(static_cast<uint8_t>(Sequence.size())) {
+ assert(Sequence.size() <= Rule::MaxElements);
+ llvm::copy(Sequence, this->Sequence);
+}
+
+Grammar::Grammar(std::unique_ptr<GrammarTable> Table) : T(std::move(Table)) {
+  // The start symbol is named '_'; find it with a binary search.
+ auto It = llvm::partition_point(
+ T->Nonterminals,
+ [](const GrammarTable::Nonterminal &X) { return X.Name < "_"; });
+ assert(It != T->Nonterminals.end() && It->Name == "_" &&
+ "symbol _ must exist in the grammar!");
+ StartSymbol = It - T->Nonterminals.begin();
+}
+
+llvm::ArrayRef<Rule> Grammar::rulesFor(SymbolID SID) const {
+ assert(isNonterminal(SID));
+ const auto &R = T->Nonterminals[SID].RuleRange;
+ assert(R.end <= T->Rules.size());
+ return llvm::makeArrayRef(&T->Rules[R.start], R.end - R.start);
+}
+
+const Rule &Grammar::lookupRule(RuleID RID) const {
+ assert(RID < T->Rules.size());
+ return T->Rules[RID];
+}
+
+llvm::StringRef Grammar::symbolName(SymbolID SID) const {
+ if (isToken(SID))
+ return T->Terminals[symbolToToken(SID)];
+ return T->Nonterminals[SID].Name;
+}
+
+std::string Grammar::dumpRule(RuleID RID) const {
+ std::string Result;
+ llvm::raw_string_ostream OS(Result);
+ const Rule &R = T->Rules[RID];
+ OS << symbolName(R.Target) << " :=";
+ for (SymbolID SID : R.seq())
+ OS << " " << symbolName(SID);
+ return Result;
+}
+
+std::string Grammar::dumpRules(SymbolID SID) const {
+ assert(isNonterminal(SID));
+ std::string Result;
+ const auto &Range = T->Nonterminals[SID].RuleRange;
+ for (RuleID RID = Range.start; RID < Range.end; ++RID)
+ Result.append(dumpRule(RID)).push_back('\n');
+ return Result;
+}
+
+std::string Grammar::dump() const {
+ std::string Result;
+ llvm::raw_string_ostream OS(Result);
+ OS << "Nonterminals:\n";
+ for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID)
+ OS << llvm::formatv(" {0} {1}\n", SID, symbolName(SID));
+ OS << "Rules:\n";
+ for (RuleID RID = 0; RID < T->Rules.size(); ++RID)
+ OS << llvm::formatv(" {0} {1}\n", RID, dumpRule(RID));
+ return OS.str();
+}
+
+std::vector<llvm::DenseSet<SymbolID>> firstSets(const Grammar &G) {
+ std::vector<llvm::DenseSet<SymbolID>> FirstSets(
+ G.table().Nonterminals.size());
+ auto ExpandFirstSet = [&FirstSets](SymbolID Target, SymbolID First) {
+ assert(isNonterminal(Target));
+ if (isToken(First))
+ return FirstSets[Target].insert(First).second;
+ bool Changed = false;
+ for (SymbolID SID : FirstSets[First])
+ Changed |= FirstSets[Target].insert(SID).second;
+ return Changed;
+ };
+
+ // A rule S := T ... implies elements in FIRST(S):
+ // - if T is a terminal, FIRST(S) contains T
+ // - if T is a nonterminal, FIRST(S) contains FIRST(T)
+ // Since FIRST(T) may not have been fully computed yet, FIRST(S) itself may
+ // end up being incomplete.
+ // We iterate until we hit a fixed point.
+ // (This isn't particularly efficient, but table building isn't on the
+ // critical path).
+ bool Changed = true;
+ while (Changed) {
+ Changed = false;
+ for (const auto &R : G.table().Rules)
+ // We only need to consider the first element because symbols are
+ // non-nullable.
+ Changed |= ExpandFirstSet(R.Target, R.seq().front());
+ }
+ return FirstSets;
+}
+
+std::vector<llvm::DenseSet<SymbolID>> followSets(const Grammar &G) {
+ auto FirstSets = firstSets(G);
+ std::vector<llvm::DenseSet<SymbolID>> FollowSets(
+ G.table().Nonterminals.size());
+ // Expand the follow set of a non-terminal symbol Y by adding all from the
+ // given symbol set.
+ auto ExpandFollowSet = [&FollowSets](SymbolID Y,
+ const llvm::DenseSet<SymbolID> &ToAdd) {
+ assert(isNonterminal(Y));
+ bool Changed = false;
+ for (SymbolID F : ToAdd)
+ Changed |= FollowSets[Y].insert(F).second;
+ return Changed;
+ };
+  // Follow sets are computed based on the following 3 rules; the computation
+  // completes at a fixed point, where no new symbols can be added to any of
+  // the follow sets.
+ //
+ // Rule 1: add endmarker to the FOLLOW(S), where S is the start symbol.
+ FollowSets[G.startSymbol()].insert(tokenSymbol(tok::eof));
+ bool Changed = true;
+ while (Changed) {
+ Changed = false;
+ for (const auto &R : G.table().Rules) {
+ // Rule 2: for a rule X := ... Y Z, we add all symbols from FIRST(Z) to
+ // FOLLOW(Y).
+ for (size_t i = 0; i + 1 < R.seq().size(); ++i) {
+ if (isToken(R.seq()[i]))
+ continue;
+ // We only need to consider the next symbol because symbols are
+ // non-nullable.
+ SymbolID Next = R.seq()[i + 1];
+ if (isToken(Next))
+ // First set for a terminal is itself.
+ Changed |= ExpandFollowSet(R.seq()[i], {Next});
+ else
+ Changed |= ExpandFollowSet(R.seq()[i], FirstSets[Next]);
+ }
+ // Rule 3: for a rule X := ... Z, we add all symbols from FOLLOW(X) to
+ // FOLLOW(Z).
+ SymbolID Z = R.seq().back();
+ if (isNonterminal(Z))
+ Changed |= ExpandFollowSet(Z, FollowSets[R.Target]);
+ }
+ }
+ return FollowSets;
+}
+
+static llvm::ArrayRef<std::string> getTerminalNames() {
+ static const std::vector<std::string> *TerminalNames = []() {
+ static std::vector<std::string> TerminalNames;
+ TerminalNames.reserve(NumTerminals);
+ for (unsigned I = 0; I < NumTerminals; ++I) {
+ tok::TokenKind K = static_cast<tok::TokenKind>(I);
+ if (const auto *Punc = tok::getPunctuatorSpelling(K))
+ TerminalNames.push_back(Punc);
+ else
+ TerminalNames.push_back(llvm::StringRef(tok::getTokenName(K)).upper());
+ }
+ return &TerminalNames;
+ }();
+ return *TerminalNames;
+}
+GrammarTable::GrammarTable() : Terminals(getTerminalNames()) {}
+
+} // namespace pseudo
+} // namespace clang
--- /dev/null
+//===--- GrammarBNF.cpp - build grammar from BNF files ----------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang-pseudo/Grammar.h"
+#include "clang/Basic/TokenKinds.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/FormatVariadic.h"
+#include <memory>
+
+namespace clang {
+namespace pseudo {
+
+namespace {
+static const llvm::StringRef OptSuffix = "_opt";
+static const llvm::StringRef StartSymbol = "_";
+
+// Builds grammar from BNF files.
+class GrammarBuilder {
+public:
+ GrammarBuilder(std::vector<std::string> &Diagnostics)
+ : Diagnostics(Diagnostics) {}
+
+ std::unique_ptr<Grammar> build(llvm::StringRef BNF) {
+ auto Specs = eliminateOptional(parse(BNF));
+
+ assert(llvm::all_of(Specs,
+ [](const RuleSpec &R) {
+ if (R.Target.endswith(OptSuffix))
+ return false;
+ return llvm::all_of(
+ R.Sequence, [](const RuleSpec::Element &E) {
+ return !E.Symbol.endswith(OptSuffix);
+ });
+ }) &&
+ "Optional symbols should be eliminated!");
+
+ auto T = std::make_unique<GrammarTable>();
+
+ // Assemble the name->ID and ID->nonterminal name maps.
+ llvm::DenseSet<llvm::StringRef> UniqueNonterminals;
+ llvm::DenseMap<llvm::StringRef, SymbolID> SymbolIds;
+ for (uint16_t I = 0; I < NumTerminals; ++I)
+ SymbolIds.try_emplace(T->Terminals[I], tokenSymbol(tok::TokenKind(I)));
+ auto Consider = [&](llvm::StringRef Name) {
+ if (!SymbolIds.count(Name))
+ UniqueNonterminals.insert(Name);
+ };
+ for (const auto &Spec : Specs) {
+ Consider(Spec.Target);
+ for (const RuleSpec::Element &Elt : Spec.Sequence)
+ Consider(Elt.Symbol);
+ }
+ llvm::for_each(UniqueNonterminals, [&T](llvm::StringRef Name) {
+ T->Nonterminals.emplace_back();
+ T->Nonterminals.back().Name = Name.str();
+ });
+ assert(T->Nonterminals.size() < (1 << (SymbolBits - 1)) &&
+ "Too many nonterminals to fit in SymbolID bits!");
+ llvm::sort(T->Nonterminals, [](const GrammarTable::Nonterminal &L,
+ const GrammarTable::Nonterminal &R) {
+ return L.Name < R.Name;
+ });
+ // Build name -> ID maps for nonterminals.
+ for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID)
+ SymbolIds.try_emplace(T->Nonterminals[SID].Name, SID);
+
+ // Convert the rules.
+ T->Rules.reserve(Specs.size());
+ std::vector<SymbolID> Symbols;
+    auto Lookup = [&SymbolIds](llvm::StringRef Name) {
+ auto It = SymbolIds.find(Name);
+ assert(It != SymbolIds.end() && "Didn't find the symbol in SymbolIds!");
+ return It->second;
+ };
+ for (const auto &Spec : Specs) {
+ assert(Spec.Sequence.size() <= Rule::MaxElements);
+ Symbols.clear();
+ for (const RuleSpec::Element &Elt : Spec.Sequence)
+ Symbols.push_back(Lookup(Elt.Symbol));
+ T->Rules.push_back(Rule(Lookup(Spec.Target), Symbols));
+ }
+ assert(T->Rules.size() < (1 << RuleBits) &&
+ "Too many rules to fit in RuleID bits!");
+ llvm::sort(T->Rules, [](const Rule &Left, const Rule &Right) {
+ // Sorted by the Target.
+ return std::tie(Left.Target, Left.Size) <
+ std::tie(Right.Target, Right.Size);
+ });
+ RuleID RulePos = 0;
+ for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID) {
+ RuleID Start = RulePos;
+ while (RulePos < T->Rules.size() && T->Rules[RulePos].Target == SID)
+ ++RulePos;
+ T->Nonterminals[SID].RuleRange = {Start, RulePos};
+ }
+ auto G = std::make_unique<Grammar>(std::move(T));
+ diagnoseGrammar(*G);
+ return G;
+ }
+
+private:
+ // Text representation of a BNF grammar rule.
+ struct RuleSpec {
+ llvm::StringRef Target;
+ struct Element {
+ llvm::StringRef Symbol; // Name of the symbol
+ };
+ std::vector<Element> Sequence;
+
+ std::string toString() const {
+ std::vector<llvm::StringRef> Body;
+ for (const auto &E : Sequence)
+ Body.push_back(E.Symbol);
+ return llvm::formatv("{0} := {1}", Target, llvm::join(Body, " "));
+ }
+ };
+
+ std::vector<RuleSpec> parse(llvm::StringRef Lines) {
+ std::vector<RuleSpec> Specs;
+ for (llvm::StringRef Line : llvm::split(Lines, '\n')) {
+ Line = Line.trim();
+ // Strip anything coming after the '#' (comment).
+ Line = Line.take_while([](char C) { return C != '#'; });
+ if (Line.empty())
+ continue;
+ RuleSpec Rule;
+ if (parseLine(Line, Rule))
+ Specs.push_back(std::move(Rule));
+ }
+ return Specs;
+ }
+
+ bool parseLine(llvm::StringRef Line, RuleSpec &Out) {
+ auto Parts = Line.split(":=");
+ if (Parts.first == Line) { // no separator in Line
+ Diagnostics.push_back(
+ llvm::formatv("Failed to parse '{0}': no separator :=", Line).str());
+ return false;
+ }
+
+ Out.Target = Parts.first.trim();
+ Out.Sequence.clear();
+ for (llvm::StringRef Chunk : llvm::split(Parts.second, ' ')) {
+ Chunk = Chunk.trim();
+ if (Chunk.empty())
+ continue; // skip empty
+
+ Out.Sequence.push_back({Chunk});
+ }
+ return true;
+ };
+
+ // Inlines all _opt symbols.
+  // For example, the rule E := id +_opt id expands, after elimination, into
+  // two rules:
+ // 1) E := id + id
+ // 2) E := id id
+ std::vector<RuleSpec> eliminateOptional(llvm::ArrayRef<RuleSpec> Input) {
+ std::vector<RuleSpec> Results;
+ std::vector<RuleSpec::Element> Storage;
+ for (const auto &R : Input) {
+ eliminateOptionalTail(
+ R.Sequence, Storage, [&Results, &Storage, &R, this]() {
+ if (Storage.empty()) {
+ Diagnostics.push_back(
+ llvm::formatv("Rule '{0}' has a nullable RHS", R.toString()));
+ return;
+ }
+ Results.push_back({R.Target, Storage});
+ });
+ assert(Storage.empty());
+ }
+ return Results;
+ }
+ void eliminateOptionalTail(llvm::ArrayRef<RuleSpec::Element> Elements,
+ std::vector<RuleSpec::Element> &Result,
+ llvm::function_ref<void()> CB) {
+ if (Elements.empty())
+ return CB();
+ auto Front = Elements.front();
+ if (!Front.Symbol.endswith(OptSuffix)) {
+ Result.push_back(std::move(Front));
+ eliminateOptionalTail(Elements.drop_front(1), Result, CB);
+ Result.pop_back();
+ return;
+ }
+ // Enumerate two options: skip the opt symbol, or inline the symbol.
+ eliminateOptionalTail(Elements.drop_front(1), Result, CB); // skip
+ Front.Symbol = Front.Symbol.drop_back(OptSuffix.size()); // drop "_opt"
+ Result.push_back(std::move(Front));
+ eliminateOptionalTail(Elements.drop_front(1), Result, CB);
+ Result.pop_back();
+ }
+
+  // Diagnoses the grammar and emits warnings, if any.
+ void diagnoseGrammar(const Grammar &G) {
+ const auto &T = G.table();
+ for (SymbolID SID = 0; SID < T.Nonterminals.size(); ++SID) {
+ auto Range = T.Nonterminals[SID].RuleRange;
+ if (Range.start == Range.end)
+ Diagnostics.push_back(
+ llvm::formatv("No rules for nonterminal: {0}", G.symbolName(SID)));
+ llvm::StringRef NameRef = T.Nonterminals[SID].Name;
+ if (llvm::all_of(NameRef, llvm::isAlpha) && NameRef.upper() == NameRef) {
+ Diagnostics.push_back(llvm::formatv(
+ "Token-like name {0} is used as a nonterminal", G.symbolName(SID)));
+ }
+ }
+ for (RuleID RID = 0; RID + 1u < T.Rules.size(); ++RID) {
+ if (T.Rules[RID] == T.Rules[RID + 1])
+ Diagnostics.push_back(
+ llvm::formatv("Duplicate rule: `{0}`", G.dumpRule(RID)));
+ // Warning for nullable non-terminals
+ if (T.Rules[RID].Size == 0)
+ Diagnostics.push_back(
+ llvm::formatv("Rule `{0}` has a nullable RHS", G.dumpRule(RID)));
+ }
+ // symbol-id -> used counts
+ std::vector<unsigned> UseCounts(T.Nonterminals.size(), 0);
+ for (const Rule &R : T.Rules)
+ for (SymbolID SID : R.seq())
+ if (isNonterminal(SID))
+ ++UseCounts[SID];
+ for (SymbolID SID = 0; SID < UseCounts.size(); ++SID)
+ if (UseCounts[SID] == 0 && T.Nonterminals[SID].Name != StartSymbol)
+ Diagnostics.push_back(
+ llvm::formatv("Nonterminal never used: {0}", G.symbolName(SID)));
+ }
+ std::vector<std::string> &Diagnostics;
+};
+} // namespace
+
+std::unique_ptr<Grammar>
+Grammar::parseBNF(llvm::StringRef BNF, std::vector<std::string> &Diagnostics) {
+ Diagnostics.clear();
+ return GrammarBuilder(Diagnostics).build(BNF);
+}
+
+} // namespace pseudo
+} // namespace clang
--- /dev/null
+//===--- LRGraph.cpp - Build an LR automaton ---------------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang-pseudo/LRGraph.h"
+#include "clang-pseudo/Grammar.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/raw_ostream.h"
+
+using ItemSet = std::vector<clang::pseudo::Item>;
+
+namespace llvm {
+// Support ItemSet as DenseMap keys.
+template <> struct DenseMapInfo<ItemSet> {
+ static inline ItemSet getEmptyKey() {
+ return {DenseMapInfo<clang::pseudo::Item>::getEmptyKey()};
+ }
+ static inline ItemSet getTombstoneKey() {
+ return {DenseMapInfo<clang::pseudo::Item>::getTombstoneKey()};
+ }
+ static unsigned getHashValue(const ItemSet &I) {
+ return llvm::hash_combine_range(I.begin(), I.end());
+ }
+ static bool isEqual(const ItemSet &LHS, const ItemSet &RHS) {
+ return LHS == RHS;
+ }
+};
+} // namespace llvm
+
+namespace clang {
+namespace pseudo {
+namespace {
+
+struct SortByNextSymbol {
+ SortByNextSymbol(const Grammar &G) : G(G) {}
+ bool operator()(const Item &L, const Item &R) {
+ if (L.hasNext() && R.hasNext() && L.next(G) != R.next(G))
+ return L.next(G) < R.next(G);
+ if (L.hasNext() != R.hasNext())
+ return L.hasNext() < R.hasNext(); // a trailing dot is minimal.
+ return L < R;
+ }
+ const Grammar &G;
+};
+
+// Computes the closure of the given item set S:
+// - extends S to contain all options for parsing the next token;
+// - nonterminals after a dot are recursively expanded into the start items
+// of all production rules that produce that nonterminal.
+//
+// Given
+// Grammar rules = [ _ := E, E := E - T, E := T, T := n, T := ( E ) ]
+// Input = [ E := . T ]
+// returns [ E := . T, T := . n, T := . ( E ) ]
+State closure(ItemSet Queue, const Grammar &G) {
+ llvm::DenseSet<Item> InQueue = {Queue.begin(), Queue.end()};
+ // We reuse the passed-by-value Queue as the final result, as it's already
+ // initialized to the right elements.
+ size_t ItIndex = 0;
+ while (ItIndex < Queue.size()) {
+ const Item &ExpandingItem = Queue[ItIndex];
+ ++ItIndex;
+ if (!ExpandingItem.hasNext())
+ continue;
+
+ SymbolID NextSym = ExpandingItem.next(G);
+ if (pseudo::isToken(NextSym))
+ continue;
+ auto RRange = G.table().Nonterminals[NextSym].RuleRange;
+ for (RuleID RID = RRange.start; RID < RRange.end; ++RID) {
+ Item NewItem = Item::start(RID, G);
+ if (InQueue.insert(NewItem).second) // new
+ Queue.push_back(std::move(NewItem));
+ }
+ }
+ Queue.shrink_to_fit();
+ llvm::sort(Queue, SortByNextSymbol(G));
+ return {std::move(Queue)};
+}
+
+// Returns all successor kernel item sets (each with the dot advanced over one
+// symbol), partitioned by the advanced symbol.
+//
+// Given
+// S = [ E := . a b, E := E . - T ]
+// returns [
+// {id(a), [ E := a . b ]},
+// {id(-), [ E := E - . T ]}
+// ]
+std::vector<std::pair<SymbolID, ItemSet>>
+nextAvailableKernelItems(const State &S, const Grammar &G) {
+ std::vector<std::pair<SymbolID, ItemSet>> Results;
+ llvm::ArrayRef<Item> AllItems = S.Items;
+ AllItems = AllItems.drop_while([](const Item &I) { return !I.hasNext(); });
+ while (!AllItems.empty()) {
+ SymbolID AdvancedSymbol = AllItems.front().next(G);
+ auto Batch = AllItems.take_while([AdvancedSymbol, &G](const Item &I) {
+ assert(I.hasNext());
+ return I.next(G) == AdvancedSymbol;
+ });
+ assert(!Batch.empty());
+ AllItems = AllItems.drop_front(Batch.size());
+
+ // Advance a dot over the Symbol.
+ ItemSet Next;
+ for (const Item &I : Batch)
+ Next.push_back(I.advance());
+ // Sort the items so identical sets have identical order (and hash).
+ llvm::sort(Next);
+ Results.push_back({AdvancedSymbol, std::move(Next)});
+ }
+ return Results;
+}
+
+} // namespace
+
+std::string Item::dump(const Grammar &G) const {
+ const auto &Rule = G.lookupRule(RID);
+ auto ToNames = [&](llvm::ArrayRef<SymbolID> Syms) {
+ std::vector<llvm::StringRef> Results;
+ for (auto SID : Syms)
+ Results.push_back(G.symbolName(SID));
+ return Results;
+ };
+ return llvm::formatv("{0} := {1} • {2}", G.symbolName(Rule.Target),
+ llvm::join(ToNames(Rule.seq().take_front(DotPos)), " "),
+ llvm::join(ToNames(Rule.seq().drop_front(DotPos)), " "))
+ .str();
+}
+
+std::string State::dump(const Grammar &G, unsigned Indent) const {
+ std::string Result;
+ llvm::raw_string_ostream OS(Result);
+ for (const auto &Item : Items)
+ OS.indent(Indent) << llvm::formatv("{0}\n", Item.dump(G));
+ return OS.str();
+}
+
+std::string LRGraph::dumpForTests(const Grammar &G) const {
+ std::string Result;
+ llvm::raw_string_ostream OS(Result);
+ OS << "States:\n";
+ for (StateID ID = 0; ID < States.size(); ++ID) {
+ OS << llvm::formatv("State {0}\n", ID);
+ OS << States[ID].dump(G, /*Indent*/ 4);
+ }
+ for (const auto &E : Edges) {
+ OS << llvm::formatv("{0} ->[{1}] {2}\n", E.Src, G.symbolName(E.Label),
+ E.Dst);
+ }
+ return OS.str();
+}
+
+LRGraph LRGraph::buildLR0(const Grammar &G) {
+ class Builder {
+ public:
+ Builder(const Grammar &G) : G(G) {}
+
+ // Adds the given state if it doesn't exist yet.
+ std::pair<StateID, /*inserted*/ bool> insert(ItemSet KernelItems) {
+ assert(llvm::is_sorted(KernelItems) &&
+ "Item must be sorted before inserting to a hash map!");
+ auto It = StatesIndex.find(KernelItems);
+ if (It != StatesIndex.end())
+ return {It->second, false};
+ States.push_back(closure(KernelItems, G));
+ StateID NextStateID = States.size() - 1;
+ StatesIndex.insert({std::move(KernelItems), NextStateID});
+ return {NextStateID, true};
+ }
+
+ void insertEdge(StateID Src, StateID Dst, SymbolID Label) {
+ Edges.push_back({Src, Dst, Label});
+ }
+
+ // Returns a state with the given id.
+ const State &find(StateID ID) const {
+ assert(ID < States.size());
+ return States[ID];
+ }
+
+ LRGraph build() && {
+ States.shrink_to_fit();
+ Edges.shrink_to_fit();
+ return LRGraph(std::move(States), std::move(Edges));
+ }
+
+ private:
+ // Keys are the **kernel** item sets.
+ llvm::DenseMap<ItemSet, /*index of States*/ size_t> StatesIndex;
+ std::vector<State> States;
+ std::vector<Edge> Edges;
+ const Grammar &G;
+ } Builder(G);
+
+ std::vector<StateID> PendingStates;
+ // Initialize states with the start symbol.
+ auto RRange = G.table().Nonterminals[G.startSymbol()].RuleRange;
+ for (RuleID RID = RRange.start; RID < RRange.end; ++RID) {
+ auto StartState = std::vector<Item>{Item::start(RID, G)};
+ auto Result = Builder.insert(std::move(StartState));
+ assert(Result.second && "State must be new");
+ PendingStates.push_back(Result.first);
+ }
+
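+ // Worklist algorithm: pop a pending state, compute all its successor kernel
+ // item sets, and enqueue any states not seen before, until no new states are
+ // discovered.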
+ while (!PendingStates.empty()) {
+ auto CurrentStateID = PendingStates.back();
+ PendingStates.pop_back();
+ for (auto Next :
+ nextAvailableKernelItems(Builder.find(CurrentStateID), G)) {
+ auto Insert = Builder.insert(Next.second);
+ if (Insert.second) // new state, insert to the pending queue.
+ PendingStates.push_back(Insert.first);
+ Builder.insertEdge(CurrentStateID, Insert.first, Next.first);
+ }
+ }
+ return std::move(Builder).build();
+}
+
+} // namespace pseudo
+} // namespace clang
--- /dev/null
+//===--- LRTable.cpp - Parsing table for LR parsers --------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang-pseudo/LRTable.h"
+#include "clang-pseudo/Grammar.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace clang {
+namespace pseudo {
+
+llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const LRTable::Action &A) {
+ switch (A.kind()) {
+ case LRTable::Action::Shift:
+ return OS << llvm::formatv("shift state {0}", A.getShiftState());
+ case LRTable::Action::Reduce:
+ return OS << llvm::formatv("reduce by rule {0}", A.getReduceRule());
+ case LRTable::Action::GoTo:
+ return OS << llvm::formatv("go to state {0}", A.getGoToState());
+ case LRTable::Action::Accept:
+ return OS << "acc";
+ case LRTable::Action::Sentinel:
+ llvm_unreachable("unexpected Sentinel action kind!");
+ }
+ llvm_unreachable("unexpected action kind!");
+}
+
+std::string LRTable::dumpStatistics() const {
+ StateID NumOfStates = 0;
+ for (StateID It : States)
+ NumOfStates = std::max<StateID>(It + 1, NumOfStates); // IDs are 0-based.
+ return llvm::formatv(R"(
+Statistics of the LR parsing table:
+ number of states: {0}
+ number of actions: {1}
+ size of the table (bytes): {2}
+)",
+ NumOfStates, Actions.size(), bytes())
+ .str();
+}
+
+std::string LRTable::dumpForTests(const Grammar &G) const {
+ std::string Result;
+ llvm::raw_string_ostream OS(Result);
+ StateID MaxState = 0;
+ for (StateID It : States)
+ MaxState = std::max(MaxState, It);
+ OS << "LRTable:\n";
+ for (StateID S = 0; S <= MaxState; ++S) {
+ OS << llvm::formatv("State {0}\n", S);
+ for (uint16_t Terminal = 0; Terminal < NumTerminals; ++Terminal) {
+ SymbolID TokID = tokenSymbol(static_cast<tok::TokenKind>(Terminal));
+ for (auto A : find(S, TokID)) {
+ if (A.kind() == LRTable::Action::Shift)
+ OS.indent(4) << llvm::formatv("'{0}': shift state {1}\n",
+ G.symbolName(TokID), A.getShiftState());
+ else if (A.kind() == LRTable::Action::Reduce)
+ OS.indent(4) << llvm::formatv("'{0}': reduce by rule {1} '{2}'\n",
+ G.symbolName(TokID), A.getReduceRule(),
+ G.dumpRule(A.getReduceRule()));
+ else if (A.kind() == LRTable::Action::Accept)
+ OS.indent(4) << llvm::formatv("'{0}': accept\n", G.symbolName(TokID));
+ }
+ }
+ for (SymbolID NontermID = 0; NontermID < G.table().Nonterminals.size();
+ ++NontermID) {
+ if (find(S, NontermID).empty())
+ continue;
+ OS.indent(4) << llvm::formatv("'{0}': go to state {1}\n",
+ G.symbolName(NontermID),
+ getGoToState(S, NontermID));
+ }
+ }
+ return OS.str();
+}
+
+llvm::ArrayRef<LRTable::Action> LRTable::getActions(StateID State,
+ SymbolID Terminal) const {
+ assert(pseudo::isToken(Terminal) && "expect terminal symbol!");
+ return find(State, Terminal);
+}
+
+LRTable::StateID LRTable::getGoToState(StateID State,
+ SymbolID Nonterminal) const {
+ assert(pseudo::isNonterminal(Nonterminal) && "expected nonterminal symbol!");
+ auto Result = find(State, Nonterminal);
+ assert(Result.size() == 1 && Result.front().kind() == Action::GoTo);
+ return Result.front().getGoToState();
+}
+
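+// Looks up the actions stored for (Src, ID). The table is stored column-major:
+// the offset vectors give, per symbol, a subrange of States/Actions sorted by
+// state, so matches are located by a binary search (llvm::partition_point)
+// followed by a linear scan over equal states.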
+llvm::ArrayRef<LRTable::Action> LRTable::find(StateID Src, SymbolID ID) const {
+ size_t Idx = isToken(ID) ? symbolToToken(ID) : ID;
+ assert(isToken(ID) ? Idx + 1 < TerminalOffset.size()
+ : Idx + 1 < NontermOffset.size());
+ std::pair<size_t, size_t> TargetStateRange =
+ isToken(ID) ? std::make_pair(TerminalOffset[Idx], TerminalOffset[Idx + 1])
+ : std::make_pair(NontermOffset[Idx], NontermOffset[Idx + 1]);
+ auto TargetedStates =
+ llvm::makeArrayRef(States.data() + TargetStateRange.first,
+ States.data() + TargetStateRange.second);
+
+ assert(llvm::is_sorted(TargetedStates) &&
+ "subrange of the StateIdx should be sorted!");
+ const LRTable::StateID *Start = llvm::partition_point(
+ TargetedStates, [&Src](LRTable::StateID S) { return S < Src; });
+ if (Start == TargetedStates.end())
+ return {};
+ const LRTable::StateID *End = Start;
+ while (End != TargetedStates.end() && *End == Src)
+ ++End;
+ return llvm::makeArrayRef(&Actions[Start - States.data()],
+ /*length=*/End - Start);
+}
+
+} // namespace pseudo
+} // namespace clang
--- /dev/null
+//===--- LRTableBuild.cpp - Build an LRTable from LRGraph --------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang-pseudo/Grammar.h"
+#include "clang-pseudo/LRGraph.h"
+#include "clang-pseudo/LRTable.h"
+#include "clang/Basic/TokenKinds.h"
+#include <cstdint>
+
+namespace llvm {
+template <> struct DenseMapInfo<clang::pseudo::LRTable::Entry> {
+ using Entry = clang::pseudo::LRTable::Entry;
+ static inline Entry getEmptyKey() {
+ static Entry E{static_cast<clang::pseudo::SymbolID>(-1), 0,
+ clang::pseudo::LRTable::Action::sentinel()};
+ return E;
+ }
+ static inline Entry getTombstoneKey() {
+ static Entry E{static_cast<clang::pseudo::SymbolID>(-2), 0,
+ clang::pseudo::LRTable::Action::sentinel()};
+ return E;
+ }
+ static unsigned getHashValue(const Entry &I) {
+ return llvm::hash_combine(I.State, I.Symbol, I.Act.opaque());
+ }
+ static bool isEqual(const Entry &LHS, const Entry &RHS) {
+ return LHS.State == RHS.State && LHS.Symbol == RHS.Symbol &&
+ LHS.Act == RHS.Act;
+ }
+};
+} // namespace llvm
+
+namespace clang {
+namespace pseudo {
+
+class LRTable::Builder {
+public:
+ bool insert(Entry E) { return Entries.insert(std::move(E)).second; }
+ LRTable build(const GrammarTable &GT) && {
+ // E.g. given the following parsing table with 3 states and 3 terminals:
+ //
+ // a b c
+ // +-------+----+-------+-+
+ // |state0 | | s0,r0 | |
+ // |state1 | acc| | |
+ // |state2 | | r1 | |
+ // +-------+----+-------+-+
+ //
+ // The final LRTable:
+ // - TerminalOffset: [a] = 0, [b] = 1, [c] = 4, [d] = 4 (d is a sentinel)
+ // - States: [ 1, 0, 0, 2]
+ // Actions: [ acc, s0, r0, r1]
+ // ~~~ corresponding range for terminal a
+ // ~~~~~~~~~~ corresponding range for terminal b
+ // First, we sort all entries by (Symbol, State, Action).
+ std::vector<Entry> Sorted(Entries.begin(), Entries.end());
+ llvm::sort(Sorted, [](const Entry &L, const Entry &R) {
+ return std::forward_as_tuple(L.Symbol, L.State, L.Act.opaque()) <
+ std::forward_as_tuple(R.Symbol, R.State, R.Act.opaque());
+ });
+
+ LRTable Table;
+ Table.Actions.reserve(Sorted.size());
+ Table.States.reserve(Sorted.size());
+ // We are good to finalize the States and Actions.
+ for (const auto &E : Sorted) {
+ Table.Actions.push_back(E.Act);
+ Table.States.push_back(E.State);
+ }
+ // Initialize the terminal and nonterminal offsets; all ranges are empty by
+ // default.
+ Table.TerminalOffset = std::vector<uint32_t>(GT.Terminals.size() + 1, 0);
+ Table.NontermOffset = std::vector<uint32_t>(GT.Nonterminals.size() + 1, 0);
+ size_t SortedIndex = 0;
+ for (SymbolID NonterminalID = 0; NonterminalID < Table.NontermOffset.size();
+ ++NonterminalID) {
+ Table.NontermOffset[NonterminalID] = SortedIndex;
+ while (SortedIndex < Sorted.size() &&
+ Sorted[SortedIndex].Symbol == NonterminalID)
+ ++SortedIndex;
+ }
+ for (size_t Terminal = 0; Terminal < Table.TerminalOffset.size();
+ ++Terminal) {
+ Table.TerminalOffset[Terminal] = SortedIndex;
+ while (SortedIndex < Sorted.size() &&
+ Sorted[SortedIndex].Symbol ==
+ tokenSymbol(static_cast<tok::TokenKind>(Terminal)))
+ ++SortedIndex;
+ }
+ return Table;
+ }
+
+private:
+ llvm::DenseSet<Entry> Entries;
+};
+
+LRTable LRTable::buildForTests(const GrammarTable &GT,
+ llvm::ArrayRef<Entry> Entries) {
+ Builder Build;
+ for (const Entry &E : Entries)
+ Build.insert(E);
+ return std::move(Build).build(GT);
+}
+
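+// Builds an SLR(1) table: shift and goto actions come directly from the LR(0)
+// graph edges, a reduce by a completed rule `A := ...` is added for every
+// token in follow(A), and the start rule accepts at end-of-file.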
+LRTable LRTable::buildSLR(const Grammar &G) {
+ Builder Build;
+ auto Graph = LRGraph::buildLR0(G);
+ for (const auto &T : Graph.edges()) {
+ Action Act = isToken(T.Label) ? Action::shift(T.Dst) : Action::goTo(T.Dst);
+ Build.insert({T.Src, T.Label, Act});
+ }
+ assert(Graph.states().size() <= (1 << StateBits) &&
+ "Number of graph states exceeds the maximum limit!");
+ auto FollowSets = followSets(G);
+ for (StateID SID = 0; SID < Graph.states().size(); ++SID) {
+ for (const Item &I : Graph.states()[SID].Items) {
+ // If we've just parsed the start symbol, we can accept the input.
+ if (G.lookupRule(I.rule()).Target == G.startSymbol() && !I.hasNext()) {
+ Build.insert({SID, tokenSymbol(tok::eof), Action::accept(I.rule())});
+ continue;
+ }
+ if (!I.hasNext()) {
+ // If we've reached the end of a rule A := ..., then we can reduce if
+ // the next token is in the follow set of A.
+ for (SymbolID Follow : FollowSets[G.lookupRule(I.rule()).Target]) {
+ assert(isToken(Follow));
+ Build.insert({SID, Follow, Action::reduce(I.rule())});
+ }
+ }
+ }
+ }
+ return std::move(Build).build(G.table());
+}
+
+} // namespace pseudo
+} // namespace clang
--- /dev/null
+//===--- Lex.cpp - extract token stream from source code ---------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang-pseudo/Token.h"
+#include "clang/Basic/SourceLocation.h"
+#include "clang/Basic/TokenKinds.h"
+#include "clang/Lex/Lexer.h"
+#include "clang/Lex/LiteralSupport.h"
+
+namespace clang {
+namespace pseudo {
+
+TokenStream lex(const std::string &Code, const clang::LangOptions &LangOpts) {
+ clang::SourceLocation Start;
+ // Tokenize using clang's lexer in raw mode.
+ // std::string guarantees null-termination, which the lexer needs.
+ clang::Lexer Lexer(Start, LangOpts, Code.data(), Code.data(),
+ Code.data() + Code.size());
+ Lexer.SetCommentRetentionState(true);
+
+ TokenStream Result;
+ clang::Token CT;
+ unsigned LastOffset = 0;
+ unsigned Line = 0;
+ unsigned Indent = 0;
+ for (Lexer.LexFromRawLexer(CT); CT.getKind() != clang::tok::eof;
+ Lexer.LexFromRawLexer(CT)) {
+ unsigned Offset =
+ CT.getLocation().getRawEncoding() - Start.getRawEncoding();
+
+ Token Tok;
+ Tok.Data = &Code[Offset];
+ Tok.Length = CT.getLength();
+ Tok.Kind = CT.getKind();
+
+ // Update current line number and indentation from raw source code.
+ unsigned NewLineStart = 0;
+ for (unsigned i = LastOffset; i < Offset; ++i) {
+ if (Code[i] == '\n') {
+ NewLineStart = i + 1;
+ ++Line;
+ }
+ }
+ if (NewLineStart || !LastOffset) {
+ Indent = 0;
+ for (char c : StringRef(Code).slice(NewLineStart, Offset)) {
+ if (c == ' ')
+ ++Indent;
+ else if (c == '\t')
+ Indent += 8;
+ else
+ break;
+ }
+ }
+ Tok.Indent = Indent;
+ Tok.Line = Line;
+
+ if (CT.isAtStartOfLine())
+ Tok.setFlag(LexFlags::StartsPPLine);
+ if (CT.needsCleaning() || CT.hasUCN())
+ Tok.setFlag(LexFlags::NeedsCleaning);
+
+ Result.push(Tok);
+ LastOffset = Offset;
+ }
+ Result.finalize();
+ return Result;
+}
+
+TokenStream cook(const TokenStream &Code, const LangOptions &LangOpts) {
+ auto CleanedStorage = std::make_shared<llvm::BumpPtrAllocator>();
+ clang::IdentifierTable Identifiers(LangOpts);
+ TokenStream Result(CleanedStorage);
+
+ for (auto Tok : Code.tokens()) {
+ if (Tok.flag(LexFlags::NeedsCleaning)) {
+ // Remove escaped newlines and trigraphs.
+ llvm::SmallString<64> CleanBuffer;
+ const char *Pos = Tok.text().begin();
+ while (Pos < Tok.text().end()) {
+ unsigned CharSize = 0;
+ CleanBuffer.push_back(
+ clang::Lexer::getCharAndSizeNoWarn(Pos, CharSize, LangOpts));
+ assert(CharSize != 0 && "no progress!");
+ Pos += CharSize;
+ }
+ // Remove universal character names (UCN).
+ llvm::SmallString<64> UCNBuffer;
+ clang::expandUCNs(UCNBuffer, CleanBuffer);
+
+ llvm::StringRef Text = llvm::StringRef(UCNBuffer).copy(*CleanedStorage);
+ Tok.Data = Text.data();
+ Tok.Length = Text.size();
+ Tok.Flags &= ~static_cast<decltype(Tok.Flags)>(LexFlags::NeedsCleaning);
+ }
+ // Cook raw_identifiers into identifiers, keywords, etc.
+ if (Tok.Kind == tok::raw_identifier)
+ Tok.Kind = Identifiers.get(Tok.text()).getTokenID();
+ Result.push(std::move(Tok));
+ }
+
+ Result.finalize();
+ return Result;
+}
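+
+// A typical pipeline (a sketch of intended usage, not exercised in this file):
+//
+//   LangOptions Opts; // the language mode affects keyword recognition
+//   TokenStream Raw = lex(Code, Opts);
+//   TokenStream Cooked = cook(Raw, Opts);
+//   TokenStream ParserInput = stripComments(Cooked);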
+
+} // namespace pseudo
+} // namespace clang
--- /dev/null
+//===--- Token.cpp - Tokens and token streams in the pseudoparser ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang-pseudo/Token.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/FormatVariadic.h"
+
+namespace clang {
+namespace pseudo {
+
+llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token &T) {
+ OS << llvm::formatv("{0} {1}:{2} ", clang::tok::getTokenName(T.Kind), T.Line,
+ T.Indent);
+ OS << '"';
+ llvm::printEscapedString(T.text(), OS);
+ OS << '"';
+ if (T.Flags)
+ OS << llvm::format(" flags=%x", T.Flags);
+ return OS;
+}
+
+llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const TokenStream &TS) {
+ OS << "Index Kind Line Text\n";
+ for (const auto &T : TS.tokens()) {
+ OS << llvm::format("%5d: %16s %4d:%-2d ", TS.index(T),
+ clang::tok::getTokenName(T.Kind), T.Line, T.Indent);
+ OS << '"';
+ llvm::printEscapedString(T.text(), OS);
+ OS << '"';
+ if (T.Flags)
+ OS << llvm::format(" flags=%x", T.Flags);
+ OS << '\n';
+ }
+ return OS;
+}
+
+llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token::Range &R) {
+ OS << llvm::formatv("[{0},{1})", R.Begin, R.End);
+ return OS;
+}
+
+TokenStream::TokenStream(std::shared_ptr<void> Payload)
+ : Payload(std::move(Payload)) {
+ Storage.emplace_back();
+ Storage.back().Kind = clang::tok::eof;
+}
+
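+// Appends the trailing eof sentinel and freezes the token list. Storage is
+// laid out as [eof, tokens..., eof]; the leading eof was added by the
+// constructor, which lets isFinalized() below tell the two states apart.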
+void TokenStream::finalize() {
+ assert(!isFinalized());
+ unsigned LastLine = Storage.back().Line;
+ Storage.emplace_back();
+ Storage.back().Kind = tok::eof;
+ Storage.back().Line = LastLine + 1;
+
+ Tokens = Storage;
+ Tokens = Tokens.drop_front().drop_back();
+}
+
+bool TokenStream::isFinalized() const {
+ assert(!Storage.empty() && Storage.front().Kind == tok::eof);
+ if (Storage.size() == 1)
+ return false;
+ return Storage.back().Kind == tok::eof;
+}
+
+void TokenStream::print(llvm::raw_ostream &OS) const {
+ bool FirstToken = true;
+ unsigned LastLine = -1;
+ StringRef LastText;
+ for (const auto &T : tokens()) {
+ StringRef Text = T.text();
+ if (FirstToken) {
+ FirstToken = false;
+ } else if (T.Line == LastLine) {
+ if (LastText.data() + LastText.size() != Text.data())
+ OS << ' ';
+ } else {
+ OS << '\n';
+ OS.indent(T.Indent);
+ }
+ OS << Text;
+ LastLine = T.Line;
+ LastText = Text;
+ }
+ if (!FirstToken)
+ OS << '\n';
+}
+
+TokenStream stripComments(const TokenStream &Input) {
+ TokenStream Out;
+ for (const Token &T : Input.tokens()) {
+ if (T.Kind == tok::comment)
+ continue;
+ Out.push(T);
+ }
+ Out.finalize();
+ return Out;
+}
+
+} // namespace pseudo
+} // namespace clang
--- /dev/null
+# This is a C++ grammar from the C++ standard [1].
+#
+# The grammar is a superset of the true grammar, requiring semantic constraints
+# to resolve ambiguities. The grammar is context-free and ambiguous (beyond the
+# limits of LR(k)). We use a general parsing algorithm (e.g. GLR) to handle the
+# grammar and generate a transition table which is used to drive the parsing.
+#
+# It aims to align with the ISO C++ grammar as much as possible. We adjust it
+# to fit the needs of the grammar-based parser:
+# - attributes are omitted; they will be handled as comments;
+# - we don't allow nullable non-terminal symbols. The few nullable
+# non-terminals in the spec grammar are adjusted to be non-nullable;
+# - the file merely describes the core C++ grammar. Preprocessor directives and
+# lexical conversions are omitted, as we reuse clang's lexer and run a fake
+# preprocessor;
+#
+# Guidelines:
+# - non-terminals are lower_case; terminals (aka tokens) correspond to
+# clang::TokenKind, written as "IDENTIFIER", "USING", "::" etc;
+# - optional symbols are supported, with a _opt suffix;
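+#   e.g. `enum-head := enum-key enum-head-name_opt enum-base_opt` is expanded
+#   into 4 concrete rules, one per subset of the _opt symbols.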
+#
+# [1] https://isocpp.org/files/papers/N4860.pdf
+#
+#
+# _ serves as a "fake" start symbol, coming with real grammar symbols.
+_ := translation-unit
+
+# gram.key
+typedef-name := IDENTIFIER
+typedef-name := simple-template-id
+namespace-name := IDENTIFIER
+namespace-name := namespace-alias
+namespace-alias := IDENTIFIER
+class-name := IDENTIFIER
+class-name := simple-template-id
+enum-name := IDENTIFIER
+template-name := IDENTIFIER
+
+# gram.basic
+#! Custom modifications to eliminate optional declaration-seq
+translation-unit := declaration-seq
+translation-unit := global-module-fragment_opt module-declaration declaration-seq_opt private-module-fragment_opt
+
+# gram.expr
+# expr.prim
+primary-expression := literal
+primary-expression := THIS
+primary-expression := ( expression )
+primary-expression := id-expression
+primary-expression := lambda-expression
+primary-expression := fold-expression
+primary-expression := requires-expression
+id-expression := unqualified-id
+id-expression := qualified-id
+unqualified-id := IDENTIFIER
+unqualified-id := operator-function-id
+unqualified-id := conversion-function-id
+unqualified-id := literal-operator-id
+unqualified-id := ~ type-name
+unqualified-id := ~ decltype-specifier
+unqualified-id := template-id
+qualified-id := nested-name-specifier TEMPLATE_opt unqualified-id
+nested-name-specifier := ::
+nested-name-specifier := type-name ::
+nested-name-specifier := namespace-name ::
+nested-name-specifier := decltype-specifier ::
+nested-name-specifier := nested-name-specifier IDENTIFIER ::
+nested-name-specifier := nested-name-specifier TEMPLATE_opt simple-template-id ::
+lambda-expression := lambda-introducer lambda-declarator_opt compound-statement
+lambda-expression := lambda-introducer < template-parameter-list > requires-clause_opt lambda-declarator_opt compound-statement
+lambda-introducer := [ lambda-capture_opt ]
+lambda-declarator := ( parameter-declaration-clause_opt ) decl-specifier-seq_opt noexcept-specifier_opt trailing-return-type_opt requires-clause_opt
+lambda-capture := capture-default
+lambda-capture := capture-list
+lambda-capture := capture-default , capture-list
+capture-default := &
+capture-default := =
+capture-list := capture
+capture-list := capture-list , capture
+capture := simple-capture
+capture := init-capture
+simple-capture := IDENTIFIER ..._opt
+simple-capture := & IDENTIFIER ..._opt
+simple-capture := THIS
+simple-capture := * THIS
+init-capture := ..._opt IDENTIFIER initializer
+init-capture := & ..._opt IDENTIFIER initializer
+fold-expression := ( cast-expression fold-operator ... )
+fold-expression := ( ... fold-operator cast-expression )
+fold-expression := ( cast-expression fold-operator ... fold-operator cast-expression )
+fold-operator := +
+fold-operator := -
+fold-operator := *
+fold-operator := /
+fold-operator := %
+fold-operator := ^
+fold-operator := &
+fold-operator := |
+fold-operator := <<
+fold-operator := >>
+fold-operator := +=
+fold-operator := -=
+fold-operator := *=
+fold-operator := /=
+fold-operator := %=
+fold-operator := ^=
+fold-operator := &=
+fold-operator := |=
+fold-operator := <<=
+fold-operator := >>=
+fold-operator := =
+fold-operator := ==
+fold-operator := !=
+fold-operator := <
+fold-operator := >
+fold-operator := <=
+fold-operator := >=
+fold-operator := &&
+fold-operator := ||
+fold-operator := ,
+fold-operator := .*
+fold-operator := ->*
+requires-expression := REQUIRES requirement-parameter-list_opt requirement-body
+requirement-parameter-list := ( parameter-declaration-clause_opt )
+requirement-body := { requirement-seq }
+requirement-seq := requirement
+requirement-seq := requirement-seq requirement
+requirement := simple-requirement
+requirement := type-requirement
+requirement := compound-requirement
+requirement := nested-requirement
+simple-requirement := expression ;
+type-requirement := TYPENAME nested-name-specifier_opt type-name ;
+compound-requirement := { expression } NOEXCEPT_opt return-type-requirement_opt ;
+return-type-requirement := -> type-constraint
+nested-requirement := REQUIRES constraint-expression ;
+# expr.post
+postfix-expression := primary-expression
+postfix-expression := postfix-expression [ expr-or-braced-init-list ]
+postfix-expression := postfix-expression ( expression-list_opt )
+postfix-expression := simple-type-specifier ( expression-list_opt )
+postfix-expression := typename-specifier ( expression-list_opt )
+postfix-expression := simple-type-specifier braced-init-list
+postfix-expression := postfix-expression . TEMPLATE_opt id-expression
+postfix-expression := postfix-expression -> TEMPLATE_opt id-expression
+postfix-expression := postfix-expression ++
+postfix-expression := postfix-expression --
+postfix-expression := DYNAMIC_CAST < type-id > ( expression )
+postfix-expression := STATIC_CAST < type-id > ( expression )
+postfix-expression := REINTERPRET_CAST < type-id > ( expression )
+postfix-expression := CONST_CAST < type-id > ( expression )
+postfix-expression := TYPEID ( expression )
+postfix-expression := TYPEID ( type-id )
+expression-list := initializer-list
+# expr.unary
+unary-expression := postfix-expression
+unary-expression := unary-operator cast-expression
+unary-expression := ++ cast-expression
+unary-expression := -- cast-expression
+unary-expression := await-expression
+unary-expression := SIZEOF unary-expression
+unary-expression := SIZEOF ( type-id )
+unary-expression := SIZEOF ... ( IDENTIFIER )
+unary-expression := ALIGNOF ( type-id )
+unary-expression := noexcept-expression
+unary-expression := new-expression
+unary-expression := delete-expression
+unary-operator := *
+unary-operator := &
+unary-operator := +
+unary-operator := -
+unary-operator := !
+unary-operator := ~
+await-expression := CO_AWAIT cast-expression
+noexcept-expression := NOEXCEPT ( expression )
+new-expression := ::_opt NEW new-placement_opt new-type-id new-initializer_opt
+new-expression := ::_opt NEW new-placement_opt ( type-id ) new-initializer_opt
+new-placement := ( expression-list )
+new-type-id := type-specifier-seq new-declarator_opt
+new-declarator := ptr-operator new-declarator_opt
+new-declarator := noptr-new-declarator
+noptr-new-declarator := [ expression_opt ]
+noptr-new-declarator := noptr-new-declarator [ constant-expression ]
+new-initializer := ( expression-list_opt )
+new-initializer := braced-init-list
+delete-expression := ::_opt DELETE cast-expression
+delete-expression := ::_opt DELETE [ ] cast-expression
+cast-expression := unary-expression
+cast-expression := ( type-id ) cast-expression
+# expr.mptr.oper
+pm-expression := cast-expression
+pm-expression := pm-expression .* cast-expression
+pm-expression := pm-expression ->* cast-expression
+# expr.mul
+multiplicative-expression := pm-expression
+multiplicative-expression := multiplicative-expression * pm-expression
+multiplicative-expression := multiplicative-expression / pm-expression
+multiplicative-expression := multiplicative-expression % pm-expression
+# expr.add
+additive-expression := multiplicative-expression
+additive-expression := additive-expression + multiplicative-expression
+additive-expression := additive-expression - multiplicative-expression
+# expr.shift
+shift-expression := additive-expression
+shift-expression := shift-expression << additive-expression
+shift-expression := shift-expression >> additive-expression
+# expr.spaceship
+compare-expression := shift-expression
+compare-expression := compare-expression <=> shift-expression
+# expr.rel
+relational-expression := compare-expression
+relational-expression := relational-expression < compare-expression
+relational-expression := relational-expression > compare-expression
+relational-expression := relational-expression <= compare-expression
+relational-expression := relational-expression >= compare-expression
+# expr.eq
+equality-expression := relational-expression
+equality-expression := equality-expression == relational-expression
+equality-expression := equality-expression != relational-expression
+# expr.bit.and
+and-expression := equality-expression
+and-expression := and-expression & equality-expression
+# expr.xor
+exclusive-or-expression := and-expression
+exclusive-or-expression := exclusive-or-expression ^ and-expression
+# expr.or
+inclusive-or-expression := exclusive-or-expression
+inclusive-or-expression := inclusive-or-expression | exclusive-or-expression
+# expr.log.and
+logical-and-expression := inclusive-or-expression
+logical-and-expression := logical-and-expression && inclusive-or-expression
+# expr.log.or
+logical-or-expression := logical-and-expression
+logical-or-expression := logical-or-expression || logical-and-expression
+# expr.cond
+conditional-expression := logical-or-expression
+conditional-expression := logical-or-expression ? expression : assignment-expression
+# expr.ass
+yield-expression := CO_YIELD assignment-expression
+yield-expression := CO_YIELD braced-init-list
+throw-expression := THROW assignment-expression_opt
+assignment-expression := conditional-expression
+assignment-expression := yield-expression
+assignment-expression := throw-expression
+assignment-expression := logical-or-expression assignment-operator initializer-clause
+assignment-operator := =
+assignment-operator := *=
+assignment-operator := /=
+assignment-operator := %=
+assignment-operator := +=
+assignment-operator := -=
+assignment-operator := >>=
+assignment-operator := <<=
+assignment-operator := &=
+assignment-operator := ^=
+assignment-operator := |=
+# expr.comma
+expression := assignment-expression
+expression := expression , assignment-expression
+# expr.const
+constant-expression := conditional-expression
+
+# gram.stmt
+statement := labeled-statement
+statement := expression-statement
+statement := compound-statement
+statement := selection-statement
+statement := iteration-statement
+statement := jump-statement
+statement := declaration-statement
+statement := try-block
+init-statement := expression-statement
+init-statement := simple-declaration
+condition := expression
+condition := decl-specifier-seq declarator brace-or-equal-initializer
+labeled-statement := IDENTIFIER : statement
+labeled-statement := CASE constant-expression : statement
+labeled-statement := DEFAULT : statement
+expression-statement := expression_opt ;
+compound-statement := { statement-seq_opt }
+statement-seq := statement
+statement-seq := statement-seq statement
+selection-statement := IF CONSTEXPR_opt ( init-statement_opt condition ) statement
+selection-statement := IF CONSTEXPR_opt ( init-statement_opt condition ) statement ELSE statement
+selection-statement := SWITCH ( init-statement_opt condition ) statement
+iteration-statement := WHILE ( condition ) statement
+iteration-statement := DO statement WHILE ( expression ) ;
+iteration-statement := FOR ( init-statement condition_opt ; expression_opt ) statement
+iteration-statement := FOR ( init-statement_opt for-range-declaration : for-range-initializer ) statement
+for-range-declaration := decl-specifier-seq declarator
+for-range-declaration := decl-specifier-seq ref-qualifier_opt [ identifier-list ]
+for-range-initializer := expr-or-braced-init-list
+jump-statement := BREAK ;
+jump-statement := CONTINUE ;
+jump-statement := RETURN expr-or-braced-init-list_opt ;
+jump-statement := coroutine-return-statement
+jump-statement := GOTO IDENTIFIER ;
+coroutine-return-statement := CO_RETURN expr-or-braced-init-list_opt ;
+declaration-statement := block-declaration
+
+# gram.dcl
+declaration-seq := declaration
+declaration-seq := declaration-seq declaration
+declaration := block-declaration
+declaration := nodeclspec-function-declaration
+declaration := function-definition
+declaration := template-declaration
+declaration := deduction-guide
+declaration := explicit-instantiation
+declaration := explicit-specialization
+declaration := export-declaration
+declaration := linkage-specification
+declaration := namespace-definition
+declaration := empty-declaration
+declaration := module-import-declaration
+block-declaration := simple-declaration
+block-declaration := asm-declaration
+block-declaration := namespace-alias-definition
+block-declaration := using-declaration
+block-declaration := using-enum-declaration
+block-declaration := using-directive
+block-declaration := static_assert-declaration
+block-declaration := alias-declaration
+block-declaration := opaque-enum-declaration
+nodeclspec-function-declaration := declarator ;
+alias-declaration := USING IDENTIFIER = defining-type-id ;
+simple-declaration := decl-specifier-seq init-declarator-list_opt ;
+simple-declaration := decl-specifier-seq ref-qualifier_opt [ identifier-list ] initializer ;
+static_assert-declaration := STATIC_ASSERT ( constant-expression ) ;
+static_assert-declaration := STATIC_ASSERT ( constant-expression , string-literal ) ;
+empty-declaration := ;
+# dcl.spec
+decl-specifier := storage-class-specifier
+decl-specifier := defining-type-specifier
+decl-specifier := function-specifier
+decl-specifier := FRIEND
+decl-specifier := TYPEDEF
+decl-specifier := CONSTEXPR
+decl-specifier := CONSTEVAL
+decl-specifier := CONSTINIT
+decl-specifier := INLINE
+decl-specifier-seq := decl-specifier
+decl-specifier-seq := decl-specifier decl-specifier-seq
+storage-class-specifier := STATIC
+storage-class-specifier := THREAD_LOCAL
+storage-class-specifier := EXTERN
+storage-class-specifier := MUTABLE
+function-specifier := VIRTUAL
+function-specifier := explicit-specifier
+explicit-specifier := EXPLICIT ( constant-expression )
+explicit-specifier := EXPLICIT
+type-specifier := simple-type-specifier
+type-specifier := elaborated-type-specifier
+type-specifier := typename-specifier
+type-specifier := cv-qualifier
+type-specifier-seq := type-specifier
+type-specifier-seq := type-specifier type-specifier-seq
+defining-type-specifier := type-specifier
+defining-type-specifier := class-specifier
+defining-type-specifier := enum-specifier
+defining-type-specifier-seq := defining-type-specifier
+defining-type-specifier-seq := defining-type-specifier defining-type-specifier-seq
+simple-type-specifier := nested-name-specifier_opt type-name
+simple-type-specifier := nested-name-specifier TEMPLATE simple-template-id
+simple-type-specifier := decltype-specifier
+simple-type-specifier := placeholder-type-specifier
+simple-type-specifier := nested-name-specifier_opt template-name
+simple-type-specifier := CHAR
+simple-type-specifier := CHAR8_T
+simple-type-specifier := CHAR16_T
+simple-type-specifier := CHAR32_T
+simple-type-specifier := WCHAR_T
+simple-type-specifier := BOOL
+simple-type-specifier := SHORT
+simple-type-specifier := INT
+simple-type-specifier := LONG
+simple-type-specifier := SIGNED
+simple-type-specifier := UNSIGNED
+simple-type-specifier := FLOAT
+simple-type-specifier := DOUBLE
+simple-type-specifier := VOID
+type-name := class-name
+type-name := enum-name
+type-name := typedef-name
+elaborated-type-specifier := class-key nested-name-specifier_opt IDENTIFIER
+elaborated-type-specifier := class-key simple-template-id
+elaborated-type-specifier := class-key nested-name-specifier TEMPLATE_opt simple-template-id
+elaborated-type-specifier := elaborated-enum-specifier
+elaborated-enum-specifier := ENUM nested-name-specifier_opt IDENTIFIER
+decltype-specifier := DECLTYPE ( expression )
+placeholder-type-specifier := type-constraint_opt AUTO
+placeholder-type-specifier := type-constraint_opt DECLTYPE ( AUTO )
+init-declarator-list := init-declarator
+init-declarator-list := init-declarator-list , init-declarator
+init-declarator := declarator initializer_opt
+init-declarator := declarator requires-clause
+declarator := ptr-declarator
+declarator := noptr-declarator parameters-and-qualifiers trailing-return-type
+ptr-declarator := noptr-declarator
+ptr-declarator := ptr-operator ptr-declarator
+noptr-declarator := declarator-id
+noptr-declarator := noptr-declarator parameters-and-qualifiers
+noptr-declarator := noptr-declarator [ constant-expression_opt ]
+noptr-declarator := ( ptr-declarator )
+parameters-and-qualifiers := ( parameter-declaration-list_opt ) cv-qualifier-seq_opt ref-qualifier_opt noexcept-specifier_opt
+trailing-return-type := -> type-id
+ptr-operator := * cv-qualifier-seq_opt
+ptr-operator := &
+ptr-operator := &&
+ptr-operator := nested-name-specifier * cv-qualifier-seq_opt
+cv-qualifier-seq := cv-qualifier cv-qualifier-seq_opt
+cv-qualifier := CONST
+cv-qualifier := VOLATILE
+ref-qualifier := &
+ref-qualifier := &&
+declarator-id := ..._opt id-expression
+type-id := type-specifier-seq abstract-declarator_opt
+defining-type-id := defining-type-specifier-seq abstract-declarator_opt
+abstract-declarator := ptr-abstract-declarator
+abstract-declarator := noptr-abstract-declarator_opt parameters-and-qualifiers trailing-return-type
+abstract-declarator := abstract-pack-declarator
+ptr-abstract-declarator := noptr-abstract-declarator
+ptr-abstract-declarator := ptr-operator ptr-abstract-declarator_opt
+noptr-abstract-declarator := noptr-abstract-declarator_opt parameters-and-qualifiers
+noptr-abstract-declarator := noptr-abstract-declarator_opt [ constant-expression ]
+noptr-abstract-declarator := ( ptr-abstract-declarator )
+abstract-pack-declarator := noptr-abstract-pack-declarator
+abstract-pack-declarator := ptr-operator abstract-pack-declarator
+noptr-abstract-pack-declarator := noptr-abstract-pack-declarator parameters-and-qualifiers
+noptr-abstract-pack-declarator := noptr-abstract-pack-declarator [ constant-expression_opt ]
+noptr-abstract-pack-declarator := ...
+#! Custom modifications to avoid nullable clause.
+parameter-declaration-clause := parameter-declaration-list
+parameter-declaration-clause := parameter-declaration-list_opt ...
+parameter-declaration-clause := parameter-declaration-list , ...
+parameter-declaration-list := parameter-declaration
+parameter-declaration-list := parameter-declaration-list , parameter-declaration
+parameter-declaration := decl-specifier-seq declarator
+parameter-declaration := decl-specifier-seq declarator = initializer-clause
+parameter-declaration := decl-specifier-seq abstract-declarator_opt
+parameter-declaration := decl-specifier-seq abstract-declarator_opt = initializer-clause
+# dcl.init
+initializer := brace-or-equal-initializer
+initializer := ( expression-list )
+brace-or-equal-initializer := = initializer-clause
+brace-or-equal-initializer := braced-init-list
+initializer-clause := assignment-expression
+initializer-clause := braced-init-list
+braced-init-list := { initializer-list ,_opt }
+braced-init-list := { designated-initializer-list ,_opt }
+braced-init-list := { }
+initializer-list := initializer-clause ..._opt
+initializer-list := initializer-list , initializer-clause ..._opt
+designated-initializer-list := designated-initializer-clause
+designated-initializer-list := designated-initializer-list , designated-initializer-clause
+designated-initializer-clause := designator brace-or-equal-initializer
+designator := . IDENTIFIER
+expr-or-braced-init-list := expression
+expr-or-braced-init-list := braced-init-list
+# dcl.fct
+function-definition := decl-specifier-seq_opt declarator virt-specifier-seq_opt function-body
+function-definition := decl-specifier-seq_opt declarator requires-clause function-body
+function-body := ctor-initializer_opt compound-statement
+function-body := function-try-block
+function-body := = DEFAULT ;
+function-body := = DELETE ;
+# dcl.enum
+enum-specifier := enum-head { enumerator-list_opt }
+enum-specifier := enum-head { enumerator-list , }
+enum-head := enum-key enum-head-name_opt enum-base_opt
+enum-head-name := nested-name-specifier_opt IDENTIFIER
+opaque-enum-declaration := enum-key enum-head-name enum-base_opt ;
+enum-key := ENUM
+enum-key := ENUM CLASS
+enum-key := ENUM STRUCT
+enum-base := : type-specifier-seq
+enumerator-list := enumerator-definition
+enumerator-list := enumerator-list , enumerator-definition
+enumerator-definition := enumerator
+enumerator-definition := enumerator = constant-expression
+enumerator := IDENTIFIER
+using-enum-declaration := USING elaborated-enum-specifier ;
+# basic.namespace
+namespace-definition := named-namespace-definition
+namespace-definition := unnamed-namespace-definition
+namespace-definition := nested-namespace-definition
+named-namespace-definition := INLINE_opt NAMESPACE IDENTIFIER { namespace-body_opt }
+unnamed-namespace-definition := INLINE_opt NAMESPACE { namespace-body_opt }
+nested-namespace-definition := NAMESPACE enclosing-namespace-specifier :: INLINE_opt IDENTIFIER { namespace-body }
+enclosing-namespace-specifier := IDENTIFIER
+enclosing-namespace-specifier := enclosing-namespace-specifier :: INLINE_opt IDENTIFIER
+#! Custom modification to avoid nullable namespace-body.
+namespace-body := declaration-seq
+namespace-alias-definition := NAMESPACE IDENTIFIER = qualified-namespace-specifier ;
+qualified-namespace-specifier := nested-name-specifier_opt namespace-name
+using-directive := USING NAMESPACE nested-name-specifier_opt namespace-name ;
+using-declaration := USING using-declarator-list ;
+using-declarator-list := using-declarator ..._opt
+using-declarator-list := using-declarator-list , using-declarator ..._opt
+using-declarator := TYPENAME_opt nested-name-specifier unqualified-id
+# dcl.asm
+asm-declaration := ASM ( string-literal ) ;
+# dcl.link
+linkage-specification := EXTERN string-literal { declaration-seq_opt }
+linkage-specification := EXTERN string-literal declaration
+
+# gram.module
+module-declaration := export-keyword_opt module-keyword module-name module-partition_opt
+module-name := module-name-qualifier_opt IDENTIFIER
+module-partition := : module-name-qualifier_opt IDENTIFIER
+module-name-qualifier := IDENTIFIER .
+module-name-qualifier := module-name-qualifier IDENTIFIER .
+export-declaration := EXPORT declaration
+export-declaration := EXPORT { declaration-seq_opt }
+export-declaration := export-keyword module-import-declaration
+module-import-declaration := import-keyword module-name
+module-import-declaration := import-keyword module-partition
+# FIXME: we don't have header-name in the grammar. Handle these in PP?
+# module-import-declaration := import-keyword header-name
+global-module-fragment := module-keyword ; declaration-seq_opt
+private-module-fragment := module-keyword : PRIVATE ; declaration-seq_opt
+
+# gram.class
+class-specifier := class-head { member-specification_opt }
+class-head := class-key class-head-name class-virt-specifier_opt base-clause_opt
+class-head := class-key base-clause_opt
+class-head-name := nested-name-specifier_opt class-name
+class-virt-specifier := contextual-final
+class-key := CLASS
+class-key := STRUCT
+class-key := UNION
+member-specification := member-declaration member-specification_opt
+member-specification := access-specifier : member-declaration member-specification_opt
+member-declaration := decl-specifier-seq_opt member-declarator-list_opt ;
+member-declaration := function-definition
+member-declaration := using-declaration
+member-declaration := using-enum-declaration
+member-declaration := static_assert-declaration
+member-declaration := template-declaration
+member-declaration := explicit-specialization
+member-declaration := deduction-guide
+member-declaration := alias-declaration
+member-declaration := opaque-enum-declaration
+member-declaration := empty-declaration
+member-declarator-list := member-declarator
+member-declarator-list := member-declarator-list , member-declarator
+member-declarator := declarator virt-specifier-seq_opt pure-specifier_opt
+member-declarator := declarator requires-clause
+member-declarator := declarator brace-or-equal-initializer
+member-declarator := IDENTIFIER_opt : constant-expression brace-or-equal-initializer_opt
+virt-specifier-seq := virt-specifier
+virt-specifier-seq := virt-specifier-seq virt-specifier
+virt-specifier := contextual-override
+virt-specifier := contextual-final
+pure-specifier := = contextual-zero
+conversion-function-id := OPERATOR conversion-type-id
+conversion-type-id := type-specifier-seq conversion-declarator_opt
+conversion-declarator := ptr-operator conversion-declarator_opt
+base-clause := : base-specifier-list
+base-specifier-list := base-specifier ..._opt
+base-specifier-list := base-specifier-list , base-specifier ..._opt
+base-specifier := class-or-decltype
+base-specifier := VIRTUAL access-specifier_opt class-or-decltype
+base-specifier := access-specifier VIRTUAL_opt class-or-decltype
+class-or-decltype := nested-name-specifier_opt type-name
+class-or-decltype := nested-name-specifier TEMPLATE simple-template-id
+class-or-decltype := decltype-specifier
+access-specifier := PRIVATE
+access-specifier := PROTECTED
+access-specifier := PUBLIC
+ctor-initializer := : mem-initializer-list
+mem-initializer-list := mem-initializer ..._opt
+mem-initializer-list := mem-initializer-list , mem-initializer ..._opt
+mem-initializer := mem-initializer-id ( expression-list_opt )
+mem-initializer := mem-initializer-id braced-init-list
+mem-initializer-id := class-or-decltype
+mem-initializer-id := IDENTIFIER
+
+# gram.over
+operator-function-id := OPERATOR operator-name
+operator-name := NEW
+operator-name := DELETE
+operator-name := NEW [ ]
+operator-name := DELETE [ ]
+operator-name := CO_AWAIT
+operator-name := ( )
+operator-name := [ ]
+operator-name := ->
+operator-name := ->*
+operator-name := ~
+operator-name := !
+operator-name := +
+operator-name := -
+operator-name := *
+operator-name := /
+operator-name := %
+operator-name := ^
+operator-name := &
+operator-name := |
+operator-name := =
+operator-name := +=
+operator-name := -=
+operator-name := *=
+operator-name := /=
+operator-name := %=
+operator-name := ^=
+operator-name := &=
+operator-name := |=
+operator-name := ==
+operator-name := !=
+operator-name := <
+operator-name := >
+operator-name := <=
+operator-name := >=
+operator-name := <=>
+operator-name := &&
+operator-name := ||
+operator-name := <<
+operator-name := >>
+operator-name := <<=
+operator-name := >>=
+operator-name := ++
+operator-name := --
+operator-name := ,
+literal-operator-id := OPERATOR string-literal IDENTIFIER
+literal-operator-id := OPERATOR user-defined-string-literal
+
+# gram.temp
+template-declaration := template-head declaration
+template-declaration := template-head concept-definition
+template-head := TEMPLATE < template-parameter-list > requires-clause_opt
+template-parameter-list := template-parameter
+template-parameter-list := template-parameter-list , template-parameter
+requires-clause := REQUIRES constraint-logical-or-expression
+constraint-logical-or-expression := constraint-logical-and-expression
+constraint-logical-or-expression := constraint-logical-or-expression || constraint-logical-and-expression
+constraint-logical-and-expression := primary-expression
+constraint-logical-and-expression := constraint-logical-and-expression && primary-expression
+template-parameter := type-parameter
+template-parameter := parameter-declaration
+type-parameter := type-parameter-key ..._opt IDENTIFIER
+type-parameter := type-parameter-key IDENTIFIER_opt = type-id
+type-parameter := type-constraint ..._opt IDENTIFIER_opt
+type-parameter := type-constraint IDENTIFIER_opt = type-id
+type-parameter := template-head type-parameter-key ..._opt IDENTIFIER_opt
+type-parameter := template-head type-parameter-key IDENTIFIER_opt = id-expression
+type-parameter-key := CLASS
+type-parameter-key := TYPENAME
+type-constraint := nested-name-specifier_opt concept-name
+type-constraint := nested-name-specifier_opt concept-name < template-argument-list_opt >
+simple-template-id := template-name < template-argument-list_opt >
+template-id := simple-template-id
+template-id := operator-function-id < template-argument-list_opt >
+template-id := literal-operator-id < template-argument-list_opt >
+template-argument-list := template-argument ..._opt
+template-argument-list := template-argument-list , template-argument ..._opt
+template-argument := constant-expression
+template-argument := type-id
+template-argument := id-expression
+constraint-expression := logical-or-expression
+deduction-guide := explicit-specifier_opt template-name ( parameter-declaration-list_opt ) -> simple-template-id ;
+concept-definition := CONCEPT concept-name = constraint-expression ;
+concept-name := IDENTIFIER
+typename-specifier := TYPENAME nested-name-specifier IDENTIFIER
+typename-specifier := TYPENAME nested-name-specifier TEMPLATE_opt simple-template-id
+explicit-instantiation := EXTERN_opt TEMPLATE declaration
+explicit-specialization := TEMPLATE < > declaration
+
+# gram.except
+try-block := TRY compound-statement handler-seq
+function-try-block := TRY ctor-initializer_opt compound-statement handler-seq
+handler-seq := handler handler-seq_opt
+handler := CATCH ( exception-declaration ) compound-statement
+exception-declaration := type-specifier-seq declarator
+exception-declaration := type-specifier-seq abstract-declarator_opt
+noexcept-specifier := NOEXCEPT ( constant-expression )
+noexcept-specifier := NOEXCEPT
+
+# gram.cpp
+identifier-list := IDENTIFIER
+identifier-list := identifier-list , IDENTIFIER
+
+# gram.lex
+#! As we use the clang lexer, most lexical symbols are not needed; we only
+#! add the needed literals.
+literal := integer-literal
+literal := character-literal
+literal := floating-point-literal
+literal := string-literal
+literal := boolean-literal
+literal := pointer-literal
+literal := user-defined-literal
+integer-literal := NUMERIC_CONSTANT
+character-literal := CHAR_CONSTANT
+character-literal := WIDE_CHAR_CONSTANT
+character-literal := UTF8_CHAR_CONSTANT
+character-literal := UTF16_CHAR_CONSTANT
+character-literal := UTF32_CHAR_CONSTANT
+floating-point-literal := NUMERIC_CONSTANT
+string-literal-chunk := STRING_LITERAL
+string-literal-chunk := WIDE_STRING_LITERAL
+string-literal-chunk := UTF8_STRING_LITERAL
+string-literal-chunk := UTF16_STRING_LITERAL
+string-literal-chunk := UTF32_STRING_LITERAL
+#! Technically, string concatenation happens at translation phase 6, which is
+#! before parsing, so it doesn't belong in the grammar. However, we extend the
+#! grammar to support it, to make the pseudoparser fully functional on
+#! practical code.
+string-literal := string-literal-chunk
+string-literal := string-literal string-literal-chunk
+user-defined-literal := user-defined-integer-literal
+user-defined-literal := user-defined-floating-point-literal
+user-defined-literal := user-defined-string-literal
+user-defined-literal := user-defined-character-literal
+user-defined-integer-literal := NUMERIC_CONSTANT
+user-defined-string-literal-chunk := STRING_LITERAL
+user-defined-string-literal-chunk := WIDE_STRING_LITERAL
+user-defined-string-literal-chunk := UTF8_STRING_LITERAL
+user-defined-string-literal-chunk := UTF16_STRING_LITERAL
+user-defined-string-literal-chunk := UTF32_STRING_LITERAL
+user-defined-string-literal := user-defined-string-literal-chunk
+user-defined-string-literal := string-literal-chunk user-defined-string-literal
+user-defined-string-literal := user-defined-string-literal string-literal-chunk
+user-defined-floating-point-literal := NUMERIC_CONSTANT
+user-defined-character-literal := CHAR_CONSTANT
+user-defined-character-literal := WIDE_CHAR_CONSTANT
+user-defined-character-literal := UTF8_CHAR_CONSTANT
+user-defined-character-literal := UTF16_CHAR_CONSTANT
+user-defined-character-literal := UTF32_CHAR_CONSTANT
+boolean-literal := FALSE
+boolean-literal := TRUE
+pointer-literal := NULLPTR
+
+#! Contextual keywords -- the clang lexer always lexes them as identifier tokens.
+#! Placeholders for literal text in the grammar that lex as other things.
+contextual-override := IDENTIFIER
+contextual-final := IDENTIFIER
+contextual-zero := NUMERIC_CONSTANT
+module-keyword := IDENTIFIER
+import-keyword := IDENTIFIER
+export-keyword := IDENTIFIER
--- /dev/null
+# Set CLANG_TOOLS_DIR to buildtree/bin, or buildtree/%(build_mode)s/bin if the
+# location is dynamic. The latter must be interpolated by lit configs.
+# FIXME: this is duplicated in many places.
+if (CMAKE_CFG_INTDIR STREQUAL ".")
+ set(LLVM_BUILD_MODE ".")
+else ()
+ set(LLVM_BUILD_MODE "%(build_mode)s")
+endif ()
+string(REPLACE ${CMAKE_CFG_INTDIR} ${LLVM_BUILD_MODE} CLANG_TOOLS_DIR ${LLVM_RUNTIME_OUTPUT_INTDIR})
+
+set(CLANG_PSEUDO_TEST_DEPS
+ clang-pseudo
+ ClangPseudoTests
+ )
+
+foreach(dep FileCheck not)
+ if(TARGET ${dep})
+ list(APPEND CLANG_PSEUDO_TEST_DEPS ${dep})
+ endif()
+endforeach()
+
+configure_lit_site_cfg(
+ ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in
+ ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py
+ MAIN_CONFIG
+ ${CMAKE_CURRENT_SOURCE_DIR}/lit.cfg.py
+ )
+
+configure_lit_site_cfg(
+ ${CMAKE_CURRENT_SOURCE_DIR}/Unit/lit.site.cfg.py.in
+ ${CMAKE_CURRENT_BINARY_DIR}/Unit/lit.site.cfg.py
+ MAIN_CONFIG
+ ${CMAKE_CURRENT_SOURCE_DIR}/Unit/lit.cfg.py
+ )
+
+add_lit_testsuite(check-clang-pseudo "Running the clang-pseudo regression tests"
+ ${CMAKE_CURRENT_BINARY_DIR}
+ DEPENDS ${CLANG_PSEUDO_TEST_DEPS})
--- /dev/null
+import lit.formats
+config.name = "clangPseudo Unit Tests"
+config.test_format = lit.formats.GoogleTest('.', 'Tests')
+config.test_source_root = config.clang_pseudo_binary_dir + "/unittests"
+config.test_exec_root = config.clang_pseudo_binary_dir + "/unittests"
+
+# Point the dynamic loader at dynamic libraries in 'lib'.
+# FIXME: it seems every project has a copy of this logic. Move it somewhere.
+import platform
+if platform.system() == 'Darwin':
+ shlibpath_var = 'DYLD_LIBRARY_PATH'
+elif platform.system() == 'Windows':
+ shlibpath_var = 'PATH'
+else:
+ shlibpath_var = 'LD_LIBRARY_PATH'
+config.environment[shlibpath_var] = os.path.pathsep.join((
+ config.shlibdir, config.llvm_libs_dir,
+ config.environment.get(shlibpath_var, '')))
+
--- /dev/null
+@LIT_SITE_CFG_IN_HEADER@
+# This is a shim to run the gtest unittests in ../unittests using lit.
+
+config.llvm_libs_dir = "@LLVM_LIBS_DIR@"
+config.shlibdir = "@SHLIBDIR@"
+
+config.clang_pseudo_source_dir = "@CMAKE_CURRENT_SOURCE_DIR@/.."
+config.clang_pseudo_binary_dir = "@CMAKE_CURRENT_BINARY_DIR@/.."
+
+# Delegate logic to lit.cfg.py.
+lit_config.load_config(config, "@CMAKE_CURRENT_SOURCE_DIR@/Unit/lit.cfg.py")
--- /dev/null
+// verify clang-tools-extra/pseudo/lib/cxx.bnf
+// RUN: clang-pseudo -grammar=%cxx-bnf-file
--- /dev/null
+int is_debug() {
+#ifndef NDEBUG
+ return 1; // in debug mode
+#else
+ return 0;
+#endif
+}
+
+/* This comment gets lexed along with the input above! We just don't CHECK it.
+
+RUN: clang-pseudo -source %s -print-source | FileCheck %s -check-prefix=SOURCE --strict-whitespace
+ SOURCE: int is_debug() {
+SOURCE-NEXT: #ifndef NDEBUG
+SOURCE-NEXT: return 1; // in debug mode
+SOURCE-NEXT: #else
+SOURCE-NEXT: return 0;
+SOURCE-NEXT: #endif
+SOURCE-NEXT: }
+
+RUN: clang-pseudo -source %s -print-tokens | FileCheck %s -check-prefix=TOKEN
+ TOKEN: 0: raw_identifier 0:0 "int" flags=1
+TOKEN-NEXT: raw_identifier 0:0 "is_debug"
+TOKEN-NEXT: l_paren 0:0 "("
+TOKEN-NEXT: r_paren 0:0 ")"
+TOKEN-NEXT: l_brace 0:0 "{"
+TOKEN-NEXT: hash 1:0 "#" flags=1
+TOKEN-NEXT: raw_identifier 1:0 "ifndef"
+TOKEN-NEXT: raw_identifier 1:0 "NDEBUG"
+TOKEN-NEXT: raw_identifier 2:2 "return" flags=1
+TOKEN-NEXT: numeric_constant 2:2 "1"
+TOKEN-NEXT: semi 2:2 ";"
+TOKEN-NEXT: comment 2:2 "// in debug mode"
+TOKEN-NEXT: hash 3:0 "#" flags=1
+TOKEN-NEXT: raw_identifier 3:0 "else"
+TOKEN-NEXT: raw_identifier 4:2 "return" flags=1
+TOKEN-NEXT: numeric_constant 4:2 "0"
+TOKEN-NEXT: semi 4:2 ";"
+TOKEN-NEXT: hash 5:0 "#" flags=1
+TOKEN-NEXT: raw_identifier 5:0 "endif"
+TOKEN-NEXT: r_brace 6:0 "}" flags=1
+
+RUN: clang-pseudo -source %s -print-directive-map | FileCheck %s -check-prefix=PPS --strict-whitespace
+ PPS: code (5 tokens)
+PPS-NEXT: #ifndef (3 tokens)
+PPS-NEXT: code (4 tokens)
+PPS-NEXT: #else (2 tokens)
+PPS-NEXT: code (3 tokens)
+PPS-NEXT: #endif (2 tokens)
+PPS-NEXT: code (2 tokens)
+ ^ including this block comment
+
+*******************************************************************************/
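
The coordinates in the TOKEN dump above are zero-based `line:indent` pairs recorded by the raw lexer. A minimal sketch (not part of the patch; the `dumpTokens` helper name is hypothetical) of producing a similar dump through the Token.h API:

```cpp
#include "clang-pseudo/Token.h"
#include "clang/Basic/LangOptions.h"
#include "clang/Basic/TokenKinds.h"
#include "llvm/Support/raw_ostream.h"
#include <string>

// Lex a buffer in raw mode and print each token with its line/indent,
// loosely mirroring the -print-tokens output checked above.
void dumpTokens(const std::string &Code) {
  clang::LangOptions Opts; // Default options, as the tool uses for now.
  clang::pseudo::TokenStream Raw = clang::pseudo::lex(Code, Opts);
  for (const clang::pseudo::Token &Tok : Raw.tokens())
    llvm::outs() << clang::tok::getTokenName(Tok.Kind) << " " << Tok.Line
                 << ":" << unsigned(Tok.Indent) << " \"" << Tok.text()
                 << "\"\n";
}
```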
--- /dev/null
+import lit.llvm
+
+lit.llvm.initialize(lit_config, config)
+lit.llvm.llvm_config.use_default_substitutions()
+
+config.name = 'ClangPseudo'
+config.suffixes = ['.test', '.c', '.cpp']
+config.excludes = ['Inputs']
+config.test_format = lit.formats.ShTest(not lit.llvm.llvm_config.use_lit_shell)
+config.test_source_root = config.clang_pseudo_source_dir + "/test"
+config.test_exec_root = config.clang_pseudo_binary_dir + "/test"
+
+config.environment['PATH'] = os.path.pathsep.join((
+ config.clang_tools_dir,
+ config.llvm_tools_dir,
+ config.environment['PATH']))
--- /dev/null
+cxx_bnf_file = os.path.join(config.clang_pseudo_source_dir, 'lib', 'cxx.bnf')
+config.substitutions.append(('%cxx-bnf-file', cxx_bnf_file))
--- /dev/null
+@LIT_SITE_CFG_IN_HEADER@
+
+# Variables needed for common llvm config.
+config.clang_tools_dir = "@CLANG_TOOLS_DIR@"
+config.lit_tools_dir = "@LLVM_LIT_TOOLS_DIR@"
+config.llvm_tools_dir = lit_config.substitute("@LLVM_TOOLS_DIR@")
+config.llvm_libs_dir = lit_config.substitute("@LLVM_LIBS_DIR@")
+config.target_triple = "@TARGET_TRIPLE@"
+config.python_executable = "@Python3_EXECUTABLE@"
+
+config.clang_pseudo_source_dir = "@CMAKE_CURRENT_SOURCE_DIR@/.."
+config.clang_pseudo_binary_dir = "@CMAKE_CURRENT_BINARY_DIR@/.."
+# Delegate logic to lit.cfg.py.
+lit_config.load_config(config, "@CMAKE_CURRENT_SOURCE_DIR@/lit.cfg.py")
--- /dev/null
+_ := expr
+expr := id
+id := IDENTIFIER
+
+# RUN: clang-pseudo -grammar %s -print-graph | FileCheck %s --check-prefix=GRAPH
+# GRAPH: States:
+# GRAPH-NEXT: State 0
+# GRAPH-NEXT: _ := • expr
+# GRAPH-NEXT: expr := • id
+# GRAPH-NEXT: id := • IDENTIFIER
+# GRAPH-NEXT: State 1
+# GRAPH-NEXT: _ := expr •
+# GRAPH-NEXT: State 2
+# GRAPH-NEXT: expr := id •
+# GRAPH-NEXT: State 3
+# GRAPH-NEXT: id := IDENTIFIER •
+
+# RUN: clang-pseudo -grammar %s -print-table | FileCheck %s --check-prefix=TABLE
+# TABLE: LRTable:
+# TABLE-NEXT: State 0
+# TABLE-NEXT: 'IDENTIFIER': shift state 3
+# TABLE-NEXT: 'expr': go to state 1
+# TABLE-NEXT: 'id': go to state 2
+# TABLE-NEXT: State 1
+# TABLE-NEXT: 'EOF': accept
+# TABLE-NEXT: State 2
+# TABLE-NEXT: 'EOF': reduce by rule 1 'expr := id'
+# TABLE-NEXT: State 3
+# TABLE-NEXT: 'EOF': reduce by rule 2 'id := IDENTIFIER'
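
The GRAPH and TABLE dumps are produced by the same calls the clang-pseudo tool makes. A minimal sketch (not part of the patch; `dumpGrammar` is a hypothetical helper) of driving those APIs directly:

```cpp
#include "clang-pseudo/Grammar.h"
#include "clang-pseudo/LRGraph.h"
#include "clang-pseudo/LRTable.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/raw_ostream.h"
#include <string>
#include <vector>

// Parse a BNF grammar, then dump its LR(0) graph and SLR(1) table,
// as -print-graph and -print-table do.
void dumpGrammar(llvm::StringRef BNF) {
  std::vector<std::string> Diags;
  auto G = clang::pseudo::Grammar::parseBNF(BNF, Diags);
  if (!Diags.empty())
    return; // The grammar had errors; diagnostics are in Diags.
  llvm::outs() << clang::pseudo::LRGraph::buildLR0(*G).dumpForTests(*G);
  llvm::outs() << clang::pseudo::LRTable::buildSLR(*G).dumpForTests(*G);
}
```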
--- /dev/null
+_ := expr
+expr := expr - expr # S/R conflict at state 4 on '-' token
+expr := IDENTIFIER
+
+# RUN: clang-pseudo -grammar %s -print-graph | FileCheck %s --check-prefix=GRAPH
+# GRAPH: States
+# GRAPH-NEXT: State 0
+# GRAPH-NEXT: _ := • expr
+# GRAPH-NEXT: expr := • expr - expr
+# GRAPH-NEXT: expr := • IDENTIFIER
+# GRAPH-NEXT: State 1
+# GRAPH-NEXT: _ := expr •
+# GRAPH-NEXT: expr := expr • - expr
+# GRAPH-NEXT: State 2
+# GRAPH-NEXT: expr := IDENTIFIER •
+# GRAPH-NEXT: State 3
+# GRAPH-NEXT: expr := • expr - expr
+# GRAPH-NEXT: expr := expr - • expr
+# GRAPH-NEXT: expr := • IDENTIFIER
+# GRAPH-NEXT: State 4
+# GRAPH-NEXT: expr := expr - expr •
+# GRAPH-NEXT: expr := expr • - expr
+# GRAPH-NEXT: 0 ->[expr] 1
+# GRAPH-NEXT: 0 ->[IDENTIFIER] 2
+# GRAPH-NEXT: 1 ->[-] 3
+# GRAPH-NEXT: 3 ->[expr] 4
+# GRAPH-NEXT: 3 ->[IDENTIFIER] 2
+# GRAPH-NEXT: 4 ->[-] 3
+
+# RUN: clang-pseudo -grammar %s -print-table | FileCheck %s --check-prefix=TABLE
+# TABLE: LRTable:
+# TABLE-NEXT: State 0
+# TABLE-NEXT: 'IDENTIFIER': shift state 2
+# TABLE-NEXT: 'expr': go to state 1
+# TABLE-NEXT: State 1
+# TABLE-NEXT: 'EOF': accept
+# TABLE-NEXT: '-': shift state 3
+# TABLE-NEXT: State 2
+# TABLE-NEXT: 'EOF': reduce by rule 1 'expr := IDENTIFIER'
+# TABLE-NEXT: '-': reduce by rule 1 'expr := IDENTIFIER'
+# TABLE-NEXT: State 3
+# TABLE-NEXT: 'IDENTIFIER': shift state 2
+# TABLE-NEXT: 'expr': go to state 4
+# TABLE-NEXT: State 4
+# TABLE-NEXT: 'EOF': reduce by rule 2 'expr := expr - expr'
+# TABLE-NEXT: '-': shift state 3
+# TABLE-NEXT: '-': reduce by rule 2 'expr := expr - expr'
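
Note that state 4 stores both a shift and a reduce on '-': the table is built for a nondeterministic (GLR-style) parser, so conflicts are kept as multiple actions rather than resolved. A sketch (not part of the patch; `hasShiftReduceConflict` is a hypothetical helper) of observing this via LRTable::find, assuming the table was built from the grammar above and states are numbered as in the dump:

```cpp
#include "clang-pseudo/Grammar.h"
#include "clang-pseudo/LRTable.h"
#include "clang/Basic/TokenKinds.h"

// In state 4 with lookahead '-', find() returns two actions:
// shift to state 3, and reduce by 'expr := expr - expr'.
bool hasShiftReduceConflict(const clang::pseudo::LRTable &T) {
  auto Actions =
      T.find(/*State=*/4, clang::pseudo::tokenSymbol(clang::tok::minus));
  return Actions.size() == 2;
}
```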
--- /dev/null
+set(LLVM_LINK_COMPONENTS support)
+
+add_clang_tool(clang-pseudo
+ ClangPseudo.cpp
+ )
+
+clang_target_link_libraries(clang-pseudo
+ PRIVATE
+ clangBasic
+ )
+
+target_link_libraries(clang-pseudo
+ PRIVATE
+ clangPseudo
+ )
+
--- /dev/null
+//===-- ClangPseudo.cpp - Clang pseudo parser tool ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang-pseudo/DirectiveMap.h"
+#include "clang-pseudo/Grammar.h"
+#include "clang-pseudo/LRGraph.h"
+#include "clang-pseudo/LRTable.h"
+#include "clang-pseudo/Token.h"
+#include "clang/Basic/LangOptions.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/MemoryBuffer.h"
+
+using clang::pseudo::Grammar;
+using llvm::cl::desc;
+using llvm::cl::init;
+using llvm::cl::opt;
+
+static opt<std::string>
+ Grammar("grammar", desc("Parse and check a BNF grammar file."), init(""));
+static opt<bool> PrintGrammar("print-grammar", desc("Print the grammar."));
+static opt<bool> PrintGraph("print-graph",
+ desc("Print the LR graph for the grammar"));
+static opt<bool> PrintTable("print-table",
+ desc("Print the LR table for the grammar"));
+static opt<std::string> Source("source", desc("Source file"));
+static opt<bool> PrintSource("print-source", desc("Print token stream"));
+static opt<bool> PrintTokens("print-tokens", desc("Print detailed token info"));
+static opt<bool>
+ PrintDirectiveMap("print-directive-map",
+ desc("Print directive structure of source code"));
+
+static std::string readOrDie(llvm::StringRef Path) {
+ llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> Text =
+ llvm::MemoryBuffer::getFile(Path);
+ if (std::error_code EC = Text.getError()) {
+ llvm::errs() << "Error: can't read file '" << Path
+ << "': " << EC.message() << "\n";
+ ::exit(1);
+ }
+ return Text.get()->getBuffer().str();
+}
+
+int main(int argc, char *argv[]) {
+ llvm::cl::ParseCommandLineOptions(argc, argv, "");
+
+ if (Grammar.getNumOccurrences()) {
+ std::string Text = readOrDie(Grammar);
+ std::vector<std::string> Diags;
+ auto G = Grammar::parseBNF(Text, Diags);
+
+ if (!Diags.empty()) {
+ llvm::errs() << llvm::join(Diags, "\n");
+ return 2;
+ }
+ llvm::outs() << llvm::formatv("grammar file {0} parsed successfully\n",
+ Grammar);
+ if (PrintGrammar)
+ llvm::outs() << G->dump();
+ if (PrintGraph)
+ llvm::outs() << clang::pseudo::LRGraph::buildLR0(*G).dumpForTests(*G);
+ if (PrintTable)
+ llvm::outs() << clang::pseudo::LRTable::buildSLR(*G).dumpForTests(*G);
+ return 0;
+ }
+
+ if (Source.getNumOccurrences()) {
+ std::string Text = readOrDie(Source);
+ clang::LangOptions LangOpts; // FIXME: use real options.
+ auto Stream = clang::pseudo::lex(Text, LangOpts);
+ auto Structure = clang::pseudo::DirectiveMap::parse(Stream);
+
+ if (PrintDirectiveMap)
+ llvm::outs() << Structure;
+ if (PrintSource)
+ Stream.print(llvm::outs());
+ if (PrintTokens)
+ llvm::outs() << Stream;
+ }
+
+ return 0;
+}
--- /dev/null
+set(LLVM_LINK_COMPONENTS
+ Support
+ )
+
+add_custom_target(ClangPseudoUnitTests)
+add_unittest(ClangPseudoUnitTests ClangPseudoTests
+ DirectiveMapTest.cpp
+ GrammarTest.cpp
+ LRTableTest.cpp
+ TokenTest.cpp
+)
+
+clang_target_link_libraries(ClangPseudoTests
+ PRIVATE
+ clangBasic
+ clangLex
+ clangTesting
+ )
+
+target_link_libraries(ClangPseudoTests
+ PRIVATE
+ clangPseudo
+ )
--- /dev/null
+//===--- DirectiveMapTest.cpp ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang-pseudo/DirectiveMap.h"
+
+#include "clang-pseudo/Token.h"
+#include "clang/Basic/LangOptions.h"
+#include "clang/Basic/TokenKinds.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace clang {
+namespace pseudo {
+namespace {
+
+using testing::_;
+using testing::ElementsAre;
+using testing::Matcher;
+using testing::Pair;
+using testing::StrEq;
+using Chunk = DirectiveMap::Chunk;
+
+MATCHER_P2(tokensAre, TS, Tokens, "tokens are " + std::string(Tokens)) {
+ std::vector<llvm::StringRef> Texts;
+ for (const Token &Tok : TS.tokens(arg.Tokens))
+ Texts.push_back(Tok.text());
+ return Matcher<std::string>(StrEq(Tokens))
+ .MatchAndExplain(llvm::join(Texts, " "), result_listener);
+}
+
+MATCHER_P(chunkKind, K, "") { return arg.kind() == K; }
+
+TEST(DirectiveMap, Parse) {
+ LangOptions Opts;
+ std::string Code = R"cpp(
+ #include <foo.h>
+
+ int main() {
+ #ifdef HAS_FOO
+ #if HAS_BAR
+ foo(bar);
+ #else
+ foo(0)
+ #endif
+ #elif NEEDS_FOO
+ #error missing_foo
+ #endif
+ }
+ )cpp";
+
+ TokenStream S = cook(lex(Code, Opts), Opts);
+ DirectiveMap PP = DirectiveMap::parse(S);
+
+ ASSERT_THAT(PP.Chunks, ElementsAre(chunkKind(Chunk::K_Directive),
+ chunkKind(Chunk::K_Code),
+ chunkKind(Chunk::K_Conditional),
+ chunkKind(Chunk::K_Code)));
+
+ EXPECT_THAT((const DirectiveMap::Directive &)PP.Chunks[0],
+ tokensAre(S, "# include < foo . h >"));
+ EXPECT_THAT((const DirectiveMap::Code &)PP.Chunks[1],
+ tokensAre(S, "int main ( ) {"));
+ EXPECT_THAT((const DirectiveMap::Code &)PP.Chunks[3], tokensAre(S, "}"));
+
+ const DirectiveMap::Conditional &Ifdef(PP.Chunks[2]);
+ EXPECT_THAT(Ifdef.Branches,
+ ElementsAre(Pair(tokensAre(S, "# ifdef HAS_FOO"), _),
+ Pair(tokensAre(S, "# elif NEEDS_FOO"), _)));
+ EXPECT_THAT(Ifdef.End, tokensAre(S, "# endif"));
+
+ const DirectiveMap &HasFoo(Ifdef.Branches[0].second);
+ const DirectiveMap &NeedsFoo(Ifdef.Branches[1].second);
+
+ EXPECT_THAT(HasFoo.Chunks, ElementsAre(chunkKind(Chunk::K_Conditional)));
+ const DirectiveMap::Conditional &If(HasFoo.Chunks[0]);
+ EXPECT_THAT(If.Branches, ElementsAre(Pair(tokensAre(S, "# if HAS_BAR"), _),
+ Pair(tokensAre(S, "# else"), _)));
+ EXPECT_THAT(If.Branches[0].second.Chunks,
+ ElementsAre(chunkKind(Chunk::K_Code)));
+ EXPECT_THAT(If.Branches[1].second.Chunks,
+ ElementsAre(chunkKind(Chunk::K_Code)));
+
+ EXPECT_THAT(NeedsFoo.Chunks, ElementsAre(chunkKind(Chunk::K_Directive)));
+ const DirectiveMap::Directive &Error(NeedsFoo.Chunks[0]);
+ EXPECT_THAT(Error, tokensAre(S, "# error missing_foo"));
+ EXPECT_EQ(Error.Kind, tok::pp_error);
+}
+
+TEST(DirectiveMap, ParseUgly) {
+ LangOptions Opts;
+ std::string Code = R"cpp(
+ /*A*/ # /*B*/ \
+ /*C*/ \
+define \
+BAR /*D*/
+/*E*/
+)cpp";
+ TokenStream S = cook(lex(Code, Opts), Opts);
+ DirectiveMap PP = DirectiveMap::parse(S);
+
+ ASSERT_THAT(PP.Chunks, ElementsAre(chunkKind(Chunk::K_Code),
+ chunkKind(Chunk::K_Directive),
+ chunkKind(Chunk::K_Code)));
+ EXPECT_THAT((const DirectiveMap::Code &)PP.Chunks[0], tokensAre(S, "/*A*/"));
+ const DirectiveMap::Directive &Define(PP.Chunks[1]);
+ EXPECT_EQ(Define.Kind, tok::pp_define);
+ EXPECT_THAT(Define, tokensAre(S, "# /*B*/ /*C*/ define BAR /*D*/"));
+ EXPECT_THAT((const DirectiveMap::Code &)PP.Chunks[2], tokensAre(S, "/*E*/"));
+}
+
+TEST(DirectiveMap, ParseBroken) {
+ LangOptions Opts;
+ std::string Code = R"cpp(
+ a
+ #endif // mismatched
+ #if X
+ b
+)cpp";
+ TokenStream S = cook(lex(Code, Opts), Opts);
+ DirectiveMap PP = DirectiveMap::parse(S);
+
+ ASSERT_THAT(PP.Chunks, ElementsAre(chunkKind(Chunk::K_Code),
+ chunkKind(Chunk::K_Directive),
+ chunkKind(Chunk::K_Conditional)));
+ EXPECT_THAT((const DirectiveMap::Code &)PP.Chunks[0], tokensAre(S, "a"));
+ const DirectiveMap::Directive &Endif(PP.Chunks[1]);
+ EXPECT_EQ(Endif.Kind, tok::pp_endif);
+ EXPECT_THAT(Endif, tokensAre(S, "# endif // mismatched"));
+
+ const DirectiveMap::Conditional &X(PP.Chunks[2]);
+ EXPECT_EQ(1u, X.Branches.size());
+ // The (only) branch of the broken conditional section runs until eof.
+ EXPECT_EQ(tok::pp_if, X.Branches.front().first.Kind);
+ EXPECT_THAT(X.Branches.front().second.Chunks,
+ ElementsAre(chunkKind(Chunk::K_Code)));
+ // The missing terminating directive is marked as pp_not_keyword.
+ EXPECT_EQ(tok::pp_not_keyword, X.End.Kind);
+ EXPECT_EQ(0u, X.End.Tokens.size());
+}
+
+} // namespace
+} // namespace pseudo
+} // namespace clang
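
For reference, a sketch (not part of the patch; `countDirectives` is a hypothetical helper) of how client code can walk the map these tests build, using Chunk::kind() and the explicit Chunk-to-variant conversions from DirectiveMap.h:

```cpp
#include "clang-pseudo/DirectiveMap.h"

// Recursively count directives, descending into every branch of every
// conditional section. Each branch contributes its #if/#elif/#else
// directive plus its nested map; the section contributes its #endif.
unsigned countDirectives(const clang::pseudo::DirectiveMap &Map) {
  using clang::pseudo::DirectiveMap;
  unsigned N = 0;
  for (const DirectiveMap::Chunk &C : Map.Chunks) {
    switch (C.kind()) {
    case DirectiveMap::Chunk::K_Directive:
      ++N;
      break;
    case DirectiveMap::Chunk::K_Conditional: {
      const auto &Cond = (const DirectiveMap::Conditional &)C;
      ++N; // The terminating #endif.
      for (const auto &Branch : Cond.Branches)
        N += 1 + countDirectives(Branch.second);
      break;
    }
    default: // K_Code and K_Empty contain no directives.
      break;
    }
  }
  return N;
}
```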
--- /dev/null
+//===--- GrammarTest.cpp - grammar tests -----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang-pseudo/Grammar.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include <memory>
+
+namespace clang {
+namespace pseudo {
+namespace {
+
+using testing::AllOf;
+using testing::ElementsAre;
+using testing::IsEmpty;
+using testing::Pair;
+using testing::UnorderedElementsAre;
+
+MATCHER_P(TargetID, SID, "") { return arg.Target == SID; }
+template <typename... T> testing::Matcher<const Rule &> Sequence(T... IDs) {
+ return testing::Property(&Rule::seq, ElementsAre(IDs...));
+}
+
+class GrammarTest : public ::testing::Test {
+public:
+ void build(llvm::StringRef BNF) {
+ Diags.clear();
+ G = Grammar::parseBNF(BNF, Diags);
+ }
+
+ SymbolID id(llvm::StringRef Name) const {
+ for (unsigned I = 0; I < NumTerminals; ++I)
+ if (G->table().Terminals[I] == Name)
+ return tokenSymbol(static_cast<tok::TokenKind>(I));
+ for (SymbolID ID = 0; ID < G->table().Nonterminals.size(); ++ID)
+ if (G->table().Nonterminals[ID].Name == Name)
+ return ID;
+ ADD_FAILURE() << "No such symbol found: " << Name;
+ return 0;
+ }
+
+protected:
+ std::unique_ptr<Grammar> G;
+ std::vector<std::string> Diags;
+};
+
+TEST_F(GrammarTest, Basic) {
+ build("_ := IDENTIFIER + _ # comment");
+ EXPECT_THAT(Diags, IsEmpty());
+
+ auto ExpectedRule =
+ AllOf(TargetID(id("_")), Sequence(id("IDENTIFIER"), id("+"), id("_")));
+ EXPECT_EQ(G->symbolName(id("_")), "_");
+ EXPECT_THAT(G->rulesFor(id("_")), UnorderedElementsAre(ExpectedRule));
+ const auto &Rule = G->lookupRule(/*RID=*/0);
+ EXPECT_THAT(Rule, ExpectedRule);
+ EXPECT_THAT(G->symbolName(Rule.seq()[0]), "IDENTIFIER");
+ EXPECT_THAT(G->symbolName(Rule.seq()[1]), "+");
+ EXPECT_THAT(G->symbolName(Rule.seq()[2]), "_");
+}
+
+TEST_F(GrammarTest, EliminatedOptional) {
+ build("_ := CONST_opt INT ;_opt");
+ EXPECT_THAT(Diags, IsEmpty());
+ EXPECT_THAT(G->table().Rules,
+ UnorderedElementsAre(Sequence(id("INT")),
+ Sequence(id("CONST"), id("INT")),
+ Sequence(id("CONST"), id("INT"), id(";")),
+ Sequence(id("INT"), id(";"))));
+}
+
+TEST_F(GrammarTest, Diagnostics) {
+ build(R"cpp(
+ _ := ,_opt
+ _ := undefined-sym
+ null :=
+ _ := IDENFIFIE # a typo of the terminal IDENTIFIER
+
+ invalid
+ )cpp");
+
+ EXPECT_EQ(G->startSymbol(), id("_"));
+ EXPECT_THAT(Diags, UnorderedElementsAre(
+ "Rule '_ := ,_opt' has a nullable RHS",
+ "Rule 'null := ' has a nullable RHS",
+ "No rules for nonterminal: undefined-sym",
+ "Failed to parse 'invalid': no separator :=",
+ "Token-like name IDENFIFIE is used as a nonterminal",
+ "No rules for nonterminal: IDENFIFIE"));
+}
+
+TEST_F(GrammarTest, FirstAndFollowSets) {
+ build(
+ R"bnf(
+_ := expr
+expr := expr - term
+expr := term
+term := IDENTIFIER
+term := ( expr )
+)bnf");
+ ASSERT_TRUE(Diags.empty());
+ auto ToPairs = [](std::vector<llvm::DenseSet<SymbolID>> Input) {
+ std::vector<std::pair<SymbolID, llvm::DenseSet<SymbolID>>> Sets;
+ for (SymbolID ID = 0; ID < Input.size(); ++ID)
+ Sets.emplace_back(ID, std::move(Input[ID]));
+ return Sets;
+ };
+
+ EXPECT_THAT(
+ ToPairs(firstSets(*G)),
+ UnorderedElementsAre(
+ Pair(id("_"), UnorderedElementsAre(id("IDENTIFIER"), id("("))),
+ Pair(id("expr"), UnorderedElementsAre(id("IDENTIFIER"), id("("))),
+ Pair(id("term"), UnorderedElementsAre(id("IDENTIFIER"), id("(")))));
+ EXPECT_THAT(
+ ToPairs(followSets(*G)),
+ UnorderedElementsAre(
+ Pair(id("_"), UnorderedElementsAre(id("EOF"))),
+ Pair(id("expr"), UnorderedElementsAre(id("-"), id("EOF"), id(")"))),
+ Pair(id("term"), UnorderedElementsAre(id("-"), id("EOF"), id(")")))));
+
+ build(R"bnf(
+# A simplified C++ decl-specifier-seq.
+_ := decl-specifier-seq
+decl-specifier-seq := decl-specifier decl-specifier-seq
+decl-specifier-seq := decl-specifier
+decl-specifier := simple-type-specifier
+decl-specifier := INLINE
+simple-type-specifier := INT
+ )bnf");
+ ASSERT_TRUE(Diags.empty());
+ EXPECT_THAT(
+ ToPairs(firstSets(*G)),
+ UnorderedElementsAre(
+ Pair(id("_"), UnorderedElementsAre(id("INLINE"), id("INT"))),
+ Pair(id("decl-specifier-seq"),
+ UnorderedElementsAre(id("INLINE"), id("INT"))),
+ Pair(id("simple-type-specifier"), UnorderedElementsAre(id("INT"))),
+ Pair(id("decl-specifier"),
+ UnorderedElementsAre(id("INLINE"), id("INT")))));
+ EXPECT_THAT(
+ ToPairs(followSets(*G)),
+ UnorderedElementsAre(
+ Pair(id("_"), UnorderedElementsAre(id("EOF"))),
+ Pair(id("decl-specifier-seq"), UnorderedElementsAre(id("EOF"))),
+ Pair(id("decl-specifier"),
+ UnorderedElementsAre(id("INLINE"), id("INT"), id("EOF"))),
+ Pair(id("simple-type-specifier"),
+ UnorderedElementsAre(id("INLINE"), id("INT"), id("EOF")))));
+}
+
+} // namespace
+} // namespace pseudo
+} // namespace clang
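
The FIRST/FOLLOW expectations above can be reproduced directly: firstSets and followSets each return one DenseSet of terminals per symbol, indexed by SymbolID. A sketch (not part of the patch; `dumpFirstSets` is a hypothetical helper):

```cpp
#include "clang-pseudo/Grammar.h"
#include "llvm/Support/raw_ostream.h"

// Print the FIRST set of every nonterminal by name, assuming G was
// parsed without diagnostics.
void dumpFirstSets(const clang::pseudo::Grammar &G) {
  auto Sets = clang::pseudo::firstSets(G);
  for (clang::pseudo::SymbolID ID = 0; ID < Sets.size(); ++ID) {
    llvm::outs() << G.symbolName(ID) << " =>";
    for (clang::pseudo::SymbolID Terminal : Sets[ID])
      llvm::outs() << " " << G.symbolName(Terminal);
    llvm::outs() << "\n";
  }
}
```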
--- /dev/null
+//===--- LRTableTest.cpp -----------------------------------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang-pseudo/LRTable.h"
+#include "clang-pseudo/Grammar.h"
+#include "clang/Basic/TokenKinds.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include <vector>
+
+namespace clang {
+namespace pseudo {
+namespace {
+
+using testing::IsEmpty;
+using testing::UnorderedElementsAre;
+using Action = LRTable::Action;
+
+TEST(LRTable, Builder) {
+ // eof semi ...
+ // +-------+----+-------+---
+ // |state0 | | s0,r0 |...
+ // |state1 | acc| |...
+ // |state2 | | r1 |...
+ // +-------+----+-------+---
+ std::vector<LRTable::Entry> Entries = {
+ {/* State */ 0, tokenSymbol(tok::semi), Action::shift(0)},
+ {/* State */ 0, tokenSymbol(tok::semi), Action::reduce(0)},
+ {/* State */ 1, tokenSymbol(tok::eof), Action::accept(2)},
+ {/* State */ 2, tokenSymbol(tok::semi), Action::reduce(1)}};
+ GrammarTable GT;
+ LRTable T = LRTable::buildForTests(GT, Entries);
+ EXPECT_THAT(T.find(0, tokenSymbol(tok::eof)), IsEmpty());
+ EXPECT_THAT(T.find(0, tokenSymbol(tok::semi)),
+ UnorderedElementsAre(Action::shift(0), Action::reduce(0)));
+ EXPECT_THAT(T.find(1, tokenSymbol(tok::eof)),
+ UnorderedElementsAre(Action::accept(2)));
+ EXPECT_THAT(T.find(1, tokenSymbol(tok::semi)), IsEmpty());
+ EXPECT_THAT(T.find(2, tokenSymbol(tok::semi)),
+ UnorderedElementsAre(Action::reduce(1)));
+ // Verify the behavior for terminals that have no available actions.
+ EXPECT_THAT(T.find(2, tokenSymbol(tok::kw_int)), IsEmpty());
+}
+
+} // namespace
+} // namespace pseudo
+} // namespace clang
--- /dev/null
+//===--- TokenTest.cpp ----------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang-pseudo/Token.h"
+#include "clang/Basic/LangOptions.h"
+#include "clang/Basic/TokenKinds.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace clang {
+namespace pseudo {
+namespace {
+
+using testing::AllOf;
+using testing::ElementsAre;
+using testing::ElementsAreArray;
+using testing::Not;
+
+MATCHER_P2(token, Text, Kind, "") {
+ return arg.Kind == Kind && arg.text() == Text;
+}
+
+MATCHER_P(hasFlag, Flag, "") { return arg.flag(Flag); }
+
+MATCHER_P2(lineIndent, Line, Indent, "") {
+ return arg.Line == (unsigned)Line && arg.Indent == (unsigned)Indent;
+}
+
+TEST(TokenTest, Lex) {
+ LangOptions Opts;
+ std::string Code = R"cpp(
+ #include <stdio.h>
+ int main() {
+ return 42; // the answer
+ }
+ )cpp";
+ TokenStream Raw = lex(Code, Opts);
+ ASSERT_TRUE(Raw.isFinalized());
+ EXPECT_THAT(Raw.tokens(),
+ ElementsAreArray({
+ // Lexing of directives is weird, especially <angled> strings.
+ token("#", tok::hash),
+ token("include", tok::raw_identifier),
+ token("<", tok::less),
+ token("stdio", tok::raw_identifier),
+ token(".", tok::period),
+ token("h", tok::raw_identifier),
+ token(">", tok::greater),
+
+ token("int", tok::raw_identifier),
+ token("main", tok::raw_identifier),
+ token("(", tok::l_paren),
+ token(")", tok::r_paren),
+ token("{", tok::l_brace),
+ token("return", tok::raw_identifier),
+ token("42", tok::numeric_constant),
+ token(";", tok::semi),
+ token("// the answer", tok::comment),
+ token("}", tok::r_brace),
+ }));
+
+ TokenStream Cooked = cook(Raw, Opts);
+ ASSERT_TRUE(Cooked.isFinalized());
+ EXPECT_THAT(Cooked.tokens(),
+ ElementsAreArray({
+ // Cooked identifier types in directives are not meaningful.
+ token("#", tok::hash),
+ token("include", tok::identifier),
+ token("<", tok::less),
+ token("stdio", tok::identifier),
+ token(".", tok::period),
+ token("h", tok::identifier),
+ token(">", tok::greater),
+
+ token("int", tok::kw_int),
+ token("main", tok::identifier),
+ token("(", tok::l_paren),
+ token(")", tok::r_paren),
+ token("{", tok::l_brace),
+ token("return", tok::kw_return),
+ token("42", tok::numeric_constant),
+ token(";", tok::semi),
+ token("// the answer", tok::comment),
+ token("}", tok::r_brace),
+ }));
+ // Check raw tokens point back into original source code.
+ EXPECT_EQ(Raw.tokens().front().text().begin(), &Code[Code.find('#')]);
+}
+
+TEST(TokenTest, LineContinuation) {
+ LangOptions Opts;
+ std::string Code = R"cpp(
+one_\
+token
+two \
+tokens
+ )cpp";
+ TokenStream Raw = lex(Code, Opts);
+ EXPECT_THAT(
+ Raw.tokens(),
+ ElementsAre(AllOf(token("one_\\\ntoken", tok::raw_identifier),
+ hasFlag(LexFlags::StartsPPLine),
+ hasFlag(LexFlags::NeedsCleaning), lineIndent(1, 0)),
+ AllOf(token("two", tok::raw_identifier),
+ hasFlag(LexFlags::StartsPPLine),
+ Not(hasFlag(LexFlags::NeedsCleaning))),
+ AllOf(token("\\\ntokens", tok::raw_identifier),
+ Not(hasFlag(LexFlags::StartsPPLine)),
+ hasFlag(LexFlags::NeedsCleaning))));
+
+ TokenStream Cooked = cook(Raw, Opts);
+ EXPECT_THAT(
+ Cooked.tokens(),
+ ElementsAre(AllOf(token("one_token", tok::identifier), lineIndent(1, 0)),
+ token("two", tok::identifier),
+ token("tokens", tok::identifier)));
+}
+
+TEST(TokenTest, EncodedCharacters) {
+ LangOptions Opts;
+ Opts.Trigraphs = true;
+ Opts.Digraphs = true;
+ Opts.C99 = true; // UCNs
+ Opts.CXXOperatorNames = true;
+ std::string Code = R"(and <: ??! '??=' \u00E9)";
+ TokenStream Raw = lex(Code, Opts);
+ EXPECT_THAT(
+ Raw.tokens(),
+ ElementsAre( // and is not recognized as && until cook().
+ AllOf(token("and", tok::raw_identifier),
+ Not(hasFlag(LexFlags::NeedsCleaning))),
+ // Digraphs are just different spellings of tokens.
+ AllOf(token("<:", tok::l_square),
+ Not(hasFlag(LexFlags::NeedsCleaning))),
+ // Trigraphs are interpreted, but still need text cleaning.
+ AllOf(token(R"(??!)", tok::pipe), hasFlag(LexFlags::NeedsCleaning)),
+ // Trigraphs must be substituted inside constants too.
+ AllOf(token(R"('??=')", tok::char_constant),
+ hasFlag(LexFlags::NeedsCleaning)),
+ // UCNs need substitution.
+ AllOf(token(R"(\u00E9)", tok::raw_identifier),
+ hasFlag(LexFlags::NeedsCleaning))));
+
+ TokenStream Cooked = cook(Raw, Opts);
+ EXPECT_THAT(
+ Cooked.tokens(),
+ ElementsAre(token("and", tok::ampamp), // alternate spelling recognized
+ token("<:", tok::l_square),
+ token("|", tok::pipe), // trigraph substituted
+ token("'#'", tok::char_constant), // trigraph substituted
+ token("é", tok::identifier))); // UCN substituted
+}
+
+TEST(TokenTest, Indentation) {
+ LangOptions Opts;
+ std::string Code = R"cpp( hello world
+no_indent \
+ line_was_continued
+)cpp";
+ TokenStream Raw = lex(Code, Opts);
+ EXPECT_THAT(Raw.tokens(), ElementsAreArray({
+ lineIndent(0, 3), // hello
+ lineIndent(0, 3), // world
+ lineIndent(1, 0), // no_indent
+ lineIndent(2, 2), // line_was_continued
+ }));
+}
+
+TEST(TokenTest, DropComments) {
+ LangOptions Opts;
+ std::string Code = R"cpp(
+ // comment
+ int /*abc*/;
+)cpp";
+ TokenStream Cooked = cook(lex(Code, Opts), Opts);
+ TokenStream Stripped = stripComments(Cooked);
+ EXPECT_THAT(Cooked.tokens(),
+ ElementsAreArray(
+ {token("// comment", tok::comment), token("int", tok::kw_int),
+ token("/*abc*/", tok::comment), token(";", tok::semi)}));
+
+ EXPECT_THAT(Stripped.tokens(), ElementsAreArray({token("int", tok::kw_int),
+ token(";", tok::semi)}));
+}
+
+} // namespace
+} // namespace pseudo
+} // namespace clang
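
Together these tests pin down the three-stage token pipeline. A condensed sketch (not part of the patch; `prepare` is a hypothetical helper) of how a client chains the stages, with the lifetime caveat the tests respect:

```cpp
#include "clang-pseudo/Token.h"
#include "clang/Basic/LangOptions.h"
#include <string>

void prepare(const std::string &Code) {
  clang::LangOptions Opts;
  // Stage 1: raw lex; word-like tokens are raw_identifier.
  clang::pseudo::TokenStream Raw = clang::pseudo::lex(Code, Opts);
  // Stage 2: cook; escapes/trigraphs decoded, keywords recognized.
  clang::pseudo::TokenStream Cooked = clang::pseudo::cook(Raw, Opts);
  // Stage 3: drop comments before grammar-based parsing.
  // Stripped tokens may reference Cooked's backing data (and Code),
  // so keep those alive for as long as Stripped is used.
  clang::pseudo::TokenStream Stripped = clang::pseudo::stripComments(Cooked);
  (void)Stripped;
}
```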
+++ /dev/null
-//===--- DirectiveMap.h - Find and strip preprocessor directives -*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// The pseudoparser tries to match a token stream to the C++ grammar.
-// Preprocessor #defines and other directives are not part of this grammar, and
-// should be removed before the file can be parsed.
-//
-// Conditional blocks like #if...#else...#endif are particularly tricky, as
-// simply stripping the directives may not produce a grammatical result:
-//
-// return
-// #ifndef DEBUG
-// 1
-// #else
-// 0
-// #endif
-// ;
-//
-// This header supports analyzing and removing the directives in a source file.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_CLANG_TOOLING_SYNTAX_PREPROCESS_H
-#define LLVM_CLANG_TOOLING_SYNTAX_PREPROCESS_H
-
-#include "clang/Basic/TokenKinds.h"
-#include "clang/Tooling/Syntax/Pseudo/Token.h"
-#include <vector>
-
-namespace clang {
-class LangOptions;
-namespace syntax {
-namespace pseudo {
-
-/// Describes the structure of a source file, as seen by the preprocessor.
-///
-/// The structure is a tree, whose leaves are plain source code and directives,
-/// and whose internal nodes are #if...#endif sections.
-///
-/// (root)
-/// |-+ Directive #include <stdio.h>
-/// |-+ Code int main() {
-/// | ` printf("hello, ");
-/// |-+ Conditional -+ Directive #ifndef NDEBUG
-/// | |-+ Code printf("debug\n");
-/// | |-+ Directive #else
-/// | |-+ Code printf("production\n");
-/// | `-+ Directive #endif
-/// |-+ Code return 0;
-/// ` }
-///
-/// Unlike the clang preprocessor, we model the full tree explicitly.
-/// This class does not recognize macro usage, only directives.
-struct DirectiveMap {
- /// A range of code (and possibly comments) containing no directives.
- struct Code {
- Token::Range Tokens;
- };
- /// A preprocessor directive.
- struct Directive {
- /// Raw tokens making up the directive, starting with `#`.
- Token::Range Tokens;
- clang::tok::PPKeywordKind Kind = clang::tok::pp_not_keyword;
- };
- /// A preprocessor conditional section.
- ///
- /// This starts with an #if, #ifdef, #ifndef etc directive.
- /// It covers all #else branches, and spans until the matching #endif.
- struct Conditional {
- /// The sequence of directives that introduce top-level alternative parses.
- ///
- /// The first branch will have an #if type directive.
- /// Subsequent branches will have #else type directives.
- std::vector<std::pair<Directive, DirectiveMap>> Branches;
- /// The directive terminating the conditional, should be #endif.
- Directive End;
- };
-
- /// Some piece of the file. {One of Code, Directive, Conditional}.
- class Chunk; // Defined below.
- std::vector<Chunk> Chunks;
-
- /// Extract preprocessor structure by examining the raw tokens.
- static DirectiveMap parse(const TokenStream &);
-
- // FIXME: add heuristically selection of conditional branches.
- // FIXME: allow deriving a preprocessed stream
-};
-llvm::raw_ostream &operator<<(llvm::raw_ostream &, const DirectiveMap &);
-llvm::raw_ostream &operator<<(llvm::raw_ostream &, const DirectiveMap::Chunk &);
-llvm::raw_ostream &operator<<(llvm::raw_ostream &, const DirectiveMap::Code &);
-llvm::raw_ostream &operator<<(llvm::raw_ostream &,
- const DirectiveMap::Directive &);
-llvm::raw_ostream &operator<<(llvm::raw_ostream &,
- const DirectiveMap::Conditional &);
-
-// FIXME: This approximates std::variant<Code, Directive, Conditional>.
-// Switch once we can use C++17.
-class DirectiveMap::Chunk {
-public:
- enum Kind { K_Empty, K_Code, K_Directive, K_Conditional };
- Kind kind() const {
- return CodeVariant ? K_Code
- : DirectiveVariant ? K_Directive
- : ConditionalVariant ? K_Conditional
- : K_Empty;
- }
-
- Chunk() = delete;
- Chunk(const Chunk &) = delete;
- Chunk(Chunk &&) = default;
- Chunk &operator=(const Chunk &) = delete;
- Chunk &operator=(Chunk &&) = default;
- ~Chunk() = default;
-
- // T => Chunk constructor.
- Chunk(Code C) : CodeVariant(std::move(C)) {}
- Chunk(Directive C) : DirectiveVariant(std::move(C)) {}
- Chunk(Conditional C) : ConditionalVariant(std::move(C)) {}
-
- // Chunk => T& and const T& conversions.
-#define CONVERSION(CONST, V) \
- explicit operator CONST V &() CONST { return *V##Variant; }
- CONVERSION(const, Code);
- CONVERSION(, Code);
- CONVERSION(const, Directive);
- CONVERSION(, Directive);
- CONVERSION(const, Conditional);
- CONVERSION(, Conditional);
-#undef CONVERSION
-
-private:
- // Wasteful, a union variant would be better!
- llvm::Optional<Code> CodeVariant;
- llvm::Optional<Directive> DirectiveVariant;
- llvm::Optional<Conditional> ConditionalVariant;
-};
-
-} // namespace pseudo
-} // namespace syntax
-} // namespace clang
-
-#endif
+++ /dev/null
-//===--- Grammar.h - grammar used by clang pseudo parser --------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines base structures for parsing & modeling a grammar for a
-// programming language:
-//
-// # This is a fake C++ BNF grammar
-// _ := translation-unit
-// translation-unit := declaration-seq_opt
-// declaration-seq := declaration
-// declaration-seq := declaration-seq declaration
-//
-// A grammar formally describes a language, and it is constructed by a set of
-// production rules. A rule is of BNF form (AAA := BBB CCC). A symbol is either
-// non-terminal or terminal, identified by a SymbolID.
-//
-// Notions about the BNF grammar:
-// - "_" is the start symbol of the augmented grammar;
-// - single-line comment is supported, starting with a #
-// - A rule describes how a nonterminal (left side of :=) is constructed, and
-// it is *per line* in the grammar file
-// - Terminals (also called tokens) correspond to the clang::TokenKind; they
-// are written in the grammar like "IDENTIFIER", "USING", "+"
-// - Nonterminals are specified with "lower-case" names in the grammar; they
-// shouldn't be nullable (has an empty sequence)
-// - optional symbols are supported (specified with a _opt suffix), and they
-// will be eliminated during the grammar parsing stage
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_CLANG_TOOLING_SYNTAX_GRAMMAR_H
-#define LLVM_CLANG_TOOLING_SYNTAX_GRAMMAR_H
-
-#include "clang/Basic/TokenKinds.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/StringRef.h"
-#include <cstdint>
-#include <vector>
-
-namespace clang {
-namespace syntax {
-namespace pseudo {
-// A SymbolID uniquely identifies a terminal/non-terminal symbol in a grammar.
-// Non-terminal IDs are indexes into a table of non-terminal symbols.
-// Terminal IDs correspond to the clang TokenKind enum.
-using SymbolID = uint16_t;
-// SymbolID is only 12 bits wide.
-// There are maximum 2^11 terminals (aka tokens) and 2^11 nonterminals.
-static constexpr uint16_t SymbolBits = 12;
-static constexpr uint16_t NumTerminals = tok::NUM_TOKENS;
-// SymbolIDs with the top bit set are tokens/terminals.
-static constexpr SymbolID TokenFlag = 1 << (SymbolBits - 1);
-inline bool isToken(SymbolID ID) { return ID & TokenFlag; }
-inline bool isNonterminal(SymbolID ID) { return !isToken(ID); }
-// The terminals are always the clang tok::TokenKind (not all are used).
-inline tok::TokenKind symbolToToken(SymbolID SID) {
- assert(isToken(SID));
- SID &= ~TokenFlag;
- assert(SID < NumTerminals);
- return static_cast<tok::TokenKind>(SID);
-}
-inline SymbolID tokenSymbol(tok::TokenKind TK) {
- return TokenFlag | static_cast<SymbolID>(TK);
-}
-
-// A RuleID uniquely identifies a production rule in a grammar.
-// It is an index into a table of rules.
-using RuleID = uint16_t;
-// There are maximum 2^12 rules.
-static constexpr unsigned RuleBits = 12;
-
-// Represent a production rule in the grammar, e.g.
-// expression := a b c
-// ^Target ^Sequence
-struct Rule {
- Rule(SymbolID Target, llvm::ArrayRef<SymbolID> Seq);
-
- // We occupy 4 bits for the sequence, in theory, it can be at most 2^4 tokens
- // long, however, we're stricter in order to reduce the size, we limit the max
- // length to 9 (this is the longest sequence in cxx grammar).
- static constexpr unsigned SizeBits = 4;
- static constexpr unsigned MaxElements = 9;
- static_assert(MaxElements <= (1 << SizeBits), "Exceeds the maximum limit");
- static_assert(SizeBits + SymbolBits <= 16,
- "Must be able to store symbol ID + size efficiently");
-
- // 16 bits for target symbol and size of sequence:
- // SymbolID : 12 | Size : 4
- SymbolID Target : SymbolBits;
- uint8_t Size : SizeBits; // Size of the Sequence
- SymbolID Sequence[MaxElements];
-
- llvm::ArrayRef<SymbolID> seq() const {
- return llvm::ArrayRef<SymbolID>(Sequence, Size);
- }
- friend bool operator==(const Rule &L, const Rule &R) {
- return L.Target == R.Target && L.seq() == R.seq();
- }
-};
-
-struct GrammarTable;
-
-// Grammar that describes a programming language, e.g. C++. It represents the
-// contents of the specified grammar.
-// It is a building block for constructing a table-based parser.
-class Grammar {
-public:
- explicit Grammar(std::unique_ptr<GrammarTable>);
-
- // Parses grammar from a BNF file.
- // Diagnostics emitted during parsing are stored in Diags.
- static std::unique_ptr<Grammar> parseBNF(llvm::StringRef BNF,
- std::vector<std::string> &Diags);
-
- // Returns the SymbolID of the start symbol '_'.
- SymbolID startSymbol() const { return StartSymbol; };
-
- // Returns all rules of the given non-terminal symbol.
- llvm::ArrayRef<Rule> rulesFor(SymbolID SID) const;
- const Rule &lookupRule(RuleID RID) const;
-
- // Gets symbol (terminal or non-terminal) name.
- // Terminals have names like "," (kw_comma) or "OPERATOR" (kw_operator).
- llvm::StringRef symbolName(SymbolID) const;
-
- // Dumps the whole grammar.
- std::string dump() const;
- // Dumps a particular rule.
- std::string dumpRule(RuleID) const;
- // Dumps all rules of the given nonterminal symbol.
- std::string dumpRules(SymbolID) const;
-
- const GrammarTable &table() const { return *T; }
-
-private:
- std::unique_ptr<GrammarTable> T;
- // The start symbol '_' of the augmented grammar.
- SymbolID StartSymbol;
-};
-// For each nonterminal X, computes the set of terminals that begin strings
-// derived from X. (Known as FIRST sets in grammar-based parsers).
-std::vector<llvm::DenseSet<SymbolID>> firstSets(const Grammar &);
-// For each nonterminal X, computes the set of terminals that could immediately
-// follow X. (Known as FOLLOW sets in grammar-based parsers).
-std::vector<llvm::DenseSet<SymbolID>> followSets(const Grammar &);
-
-// Storage for the underlying data of the Grammar.
-// It can be constructed dynamically (from compiling BNF file) or statically
-// (a compiled data-source).
-struct GrammarTable {
- GrammarTable();
-
- struct Nonterminal {
- std::string Name;
- // Corresponding rules that construct the non-terminal, it is a [start, end)
- // index range of the Rules table.
- struct {
- RuleID start;
- RuleID end;
- } RuleRange;
- };
-
- // The rules are sorted (and thus grouped) by target symbol.
- // RuleID is the index of the vector.
- std::vector<Rule> Rules;
- // A table of terminals (aka tokens). It corresponds to the clang::Token.
- // clang::tok::TokenKind is the index of the table.
- llvm::ArrayRef<std::string> Terminals;
- // A table of nonterminals, sorted by name.
- // SymbolID is the index of the table.
- std::vector<Nonterminal> Nonterminals;
-};
-
-} // namespace pseudo
-} // namespace syntax
-} // namespace clang
-
-#endif // LLVM_CLANG_TOOLING_SYNTAX_GRAMMAR_H
+++ /dev/null
-//===--- LRGraph.h - Build an LR automaton ------------------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// LR parsers are bottom-up parsers -- they scan the input from left to right,
-// and collect the right-hand side of a production rule (called handle) on top
-// of the stack, then replace (reduce) the handle with the nonterminal defined
-// by the production rule.
-//
-// This file defines LRGraph, a deterministic handle-finding finite-state
-// automaton, which is a key component in LR parsers to recognize any of
-// handles in the grammar efficiently. We build the LR table (ACTION and GOTO
-// Table) based on the LRGraph.
-//
-// LRGraph can be constructed for any context-free grammars.
-// Even for a LR-ambiguous grammar, we can construct a deterministic FSA, but
-// interpretation of the FSA is nondeterministic -- we might in a state where
-// we can continue searching an handle and identify a handle (called
-// shift/reduce conflicts), or identify more than one handle (callled
-// reduce/reduce conflicts).
-//
-// LRGraph is a common model for all variants of LR automatons, from the most
-// basic one LR(0), the powerful SLR(1), LR(1) which uses a one-token lookahead
-// in making decisions.
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_CLANG_TOOLING_SYNTAX_PSEUDO_LRGRAPH_H
-#define LLVM_CLANG_TOOLING_SYNTAX_PSEUDO_LRGRAPH_H
-
-#include "clang/Tooling/Syntax/Pseudo/Grammar.h"
-#include "llvm/ADT/Hashing.h"
-#include <vector>
-
-namespace clang {
-namespace syntax {
-namespace pseudo {
-
-// An LR item -- a grammar rule with a dot at some position of the body.
-// e.g. a production rule A := X Y yields 3 items:
-// A := . X Y
-// A := X . Y
-// A := X Y .
-// An item indicates how much of a production rule has been recognized at a
-// position (described by dot), for example, A := X . Y indicates that we have
-// recognized the X part from the input, and we hope next to see the input
-// derivable from Y.
-class Item {
-public:
- static Item start(RuleID ID, const Grammar &G) {
- Item I;
- I.RID = ID;
- I.RuleLength = G.lookupRule(ID).Size;
- return I;
- }
- static Item sentinel(RuleID ID) {
- Item I;
- I.RID = ID;
- return I;
- }
-
- RuleID rule() const { return RID; }
- uint8_t dot() const { return DotPos; }
-
- bool hasNext() const { return DotPos < RuleLength; }
- SymbolID next(const Grammar &G) const {
- assert(hasNext());
- return G.lookupRule(RID).Sequence[DotPos];
- }
-
- Item advance() const {
- assert(hasNext());
- Item I = *this;
- ++I.DotPos;
- return I;
- }
-
- std::string dump(const Grammar &G) const;
-
- bool operator==(const Item &I) const {
- return DotPos == I.DotPos && RID == I.RID;
- }
- bool operator<(const Item &I) const {
- return std::tie(RID, DotPos) < std::tie(I.RID, I.DotPos);
- }
- friend llvm::hash_code hash_value(const Item &I) {
- return llvm::hash_combine(I.RID, I.DotPos);
- }
-
-private:
- RuleID RID = 0;
- uint8_t DotPos = 0;
- uint8_t RuleLength = 0; // the length of rule body.
-};
-
-// A state represents a node in the LR automaton graph. It is an item set, which
-// contains all possible rules that the LR parser may be parsing in that state.
-//
-// Conceptually, If we knew in advance what we're parsing, at any point we're
-// partway through parsing a production, sitting on a stack of partially parsed
-// productions. But because we don't know, there could be *several* productions
-// we're partway through. The set of possibilities is the parser state, and we
-// precompute all the transitions between these states.
-struct State {
- // A full set of items (including non-kernel items) representing the state,
- // in a canonical order (see SortByNextSymbol in the cpp file).
- std::vector<Item> Items;
-
- std::string dump(const Grammar &G, unsigned Indent = 0) const;
-};
-
-// LRGraph is a deterministic finite state automaton for LR parsing.
-//
-// Intuitively, an LR automaton is a transition graph. The graph has a
-// collection of nodes, called States. Each state corresponds to a particular
-// item set, which represents a condition that could occur during the process of
-// parsing a production. Edges are directed from one state to another. Each edge
-// is labeled by a grammar symbol (terminal or nonterminal).
-//
-// LRGraph is used to construct the LR parsing table which is a core
-// data-structure driving the LR parser.
-class LRGraph {
-public:
- // StateID is the index in States table.
- using StateID = uint16_t;
-
- // Constructs an LR(0) automaton.
- static LRGraph buildLR0(const Grammar &);
-
- // An edge in the LR graph, it represents a transition in the LR automaton.
- // If the parser is at state Src, with a lookahead Label, then it
- // transits to state Dst.
- struct Edge {
- StateID Src, Dst;
- SymbolID Label;
- };
-
- llvm::ArrayRef<State> states() const { return States; }
- llvm::ArrayRef<Edge> edges() const { return Edges; }
-
- std::string dumpForTests(const Grammar &) const;
-
-private:
- LRGraph(std::vector<State> States, std::vector<Edge> Edges)
- : States(std::move(States)), Edges(std::move(Edges)) {}
-
- std::vector<State> States;
- std::vector<Edge> Edges;
-};
-
-} // namespace pseudo
-} // namespace syntax
-} // namespace clang
-
-namespace llvm {
-// Support clang::syntax::pseudo::Item as DenseMap keys.
-template <> struct DenseMapInfo<clang::syntax::pseudo::Item> {
- static inline clang::syntax::pseudo::Item getEmptyKey() {
- return clang::syntax::pseudo::Item::sentinel(-1);
- }
- static inline clang::syntax::pseudo::Item getTombstoneKey() {
- return clang::syntax::pseudo::Item::sentinel(-2);
- }
- static unsigned getHashValue(const clang::syntax::pseudo::Item &I) {
- return hash_value(I);
- }
- static bool isEqual(const clang::syntax::pseudo::Item &LHS,
- const clang::syntax::pseudo::Item &RHS) {
- return LHS == RHS;
- }
-};
-} // namespace llvm
-
-#endif // LLVM_CLANG_TOOLING_SYNTAX_PSEUDO_LRGRAPH_H
+++ /dev/null
-//===--- LRTable.h - Define LR Parsing Table ---------------------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// The LRTable (referred as LR parsing table in the LR literature) is the core
-// component in LR parsers, it drives the LR parsers by specifying an action to
-// take given the current state on the top of the stack and the current
-// lookahead token.
-//
-// The LRTable can be described as a matrix where the rows represent
-// the states of the LR graph, the columns represent the symbols of the
-// grammar, and each entry of the matrix (called action) represents a
-// state transition in the graph.
-//
-// Typically, based on the category of the grammar symbol, the LRTable is
-// broken into two logically separate tables:
-// - ACTION table with terminals as columns -- e.g ACTION[S, a] specifies
-// next action (shift/reduce/accept/error) on state S under a lookahead
-// terminal a
-// - GOTO table with nonterminals as columns -- e.g. GOTO[S, X] specify
-// the state which we transist to from the state S with the nonterminal X
-//
-// LRTable is *performance-critial* as it is consulted frequently during a
-// parse. In general, LRTable is very sparse (most of the entries are empty).
-// For example, for the C++ language, the SLR table has ~1500 states and 650
-// symbols which results in a matrix having 975K entries, ~90% of entries are
-// empty.
-//
-// This file implements a speed-and-space-efficient LRTable.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_CLANG_TOOLING_SYNTAX_PSEUDO_LRTABLE_H
-#define LLVM_CLANG_TOOLING_SYNTAX_PSEUDO_LRTABLE_H
-
-#include "clang/Tooling/Syntax/Pseudo/Grammar.h"
-#include "llvm/ADT/ArrayRef.h"
-#include <cstdint>
-#include <vector>
-
-namespace clang {
-namespace syntax {
-namespace pseudo {
-
-// Represents the LR parsing table, which can efficiently the question "what is
-// the next step given the lookahead token and current state on top of the
-// stack?".
-//
-// This is a dense implementation, which only takes an amount of space that is
-// proportional to the number of non-empty entries in the table.
-//
-// Unlike the typical LR parsing table which allows at most one available action
-// per entry, conflicted actions are allowed in LRTable. The LRTable is designed
-// to be used in nondeterministic LR parsers (e.g. GLR).
-class LRTable {
-public:
- // StateID is only 13 bits wide.
- using StateID = uint16_t;
- static constexpr unsigned StateBits = 13;
-
- // Action represents the terminal and nonterminal actions, it combines the
- // entry of the ACTION and GOTO tables from the LR literature.
- class Action {
- public:
- enum Kind : uint8_t {
- Sentinel = 0,
- // Terminal actions, corresponding to entries of ACTION table.
-
- // Shift to state n: move forward with the lookahead, and push state n
- // onto the state stack.
- // A shift is a forward transition, and the value n is the next state that
- // the parser is to enter.
- Shift,
- // Reduce by a rule: pop the state stack.
- Reduce,
- // Signals that we have parsed the input successfully.
- Accept,
-
- // Nonterminal actions, corresponding to entry of GOTO table.
-
- // Go to state n: push state n onto the state stack.
- // Similar to Shift, but it is a nonterminal forward transition.
- GoTo,
- };
-
- static Action accept(RuleID RID) { return Action(Accept, RID); }
- static Action goTo(StateID S) { return Action(GoTo, S); }
- static Action shift(StateID S) { return Action(Shift, S); }
- static Action reduce(RuleID RID) { return Action(Reduce, RID); }
- static Action sentinel() { return Action(Sentinel, 0); }
-
- StateID getShiftState() const {
- assert(kind() == Shift);
- return Value;
- }
- StateID getGoToState() const {
- assert(kind() == GoTo);
- return Value;
- }
- RuleID getReduceRule() const {
- assert(kind() == Reduce);
- return Value;
- }
- Kind kind() const { return static_cast<Kind>(K); }
-
- bool operator==(const Action &L) const { return opaque() == L.opaque(); }
- uint16_t opaque() const { return K << ValueBits | Value; };
-
- private:
- Action(Kind K1, unsigned Value) : K(K1), Value(Value) {}
- static constexpr unsigned ValueBits = StateBits;
- static constexpr unsigned KindBits = 3;
- static_assert(ValueBits >= RuleBits, "Value must be able to store RuleID");
- static_assert(KindBits + ValueBits <= 16,
- "Must be able to store kind and value efficiently");
- uint16_t K : KindBits;
- // Either StateID or RuleID, depending on the Kind.
- uint16_t Value : ValueBits;
- };
-
- // Returns all available actions for the given state on a terminal.
- // Expected to be called by LR parsers.
- llvm::ArrayRef<Action> getActions(StateID State, SymbolID Terminal) const;
- // Returns the state after we reduce a nonterminal.
- // Expected to be called by LR parsers.
- StateID getGoToState(StateID State, SymbolID Nonterminal) const;
-
- // Looks up available actions.
- // Returns empty if no available actions in the table.
- llvm::ArrayRef<Action> find(StateID State, SymbolID Symbol) const;
-
- size_t bytes() const {
- return sizeof(*this) + Actions.capacity() * sizeof(Action) +
- States.capacity() * sizeof(StateID) +
- NontermOffset.capacity() * sizeof(uint32_t) +
- TerminalOffset.capacity() * sizeof(uint32_t);
- }
-
- std::string dumpStatistics() const;
- std::string dumpForTests(const Grammar &G) const;
-
- // Build a SLR(1) parsing table.
- static LRTable buildSLR(const Grammar &G);
-
- class Builder;
- // Represents an entry in the table, used for building the LRTable.
- struct Entry {
- StateID State;
- SymbolID Symbol;
- Action Act;
- };
- // Build a specifid table for testing purposes.
- static LRTable buildForTests(const GrammarTable &, llvm::ArrayRef<Entry>);
-
-private:
- // Conceptually the LR table is a multimap from (State, SymbolID) => Action.
- // Our physical representation is quite different for compactness.
-
- // Index is nonterminal SymbolID, value is the offset into States/Actions
- // where the entries for this nonterminal begin.
- // Give a non-terminal id, the corresponding half-open range of StateIdx is
- // [NontermIdx[id], NontermIdx[id+1]).
- std::vector<uint32_t> NontermOffset;
- // Similar to NontermOffset, but for terminals, index is tok::TokenKind.
- std::vector<uint32_t> TerminalOffset;
- // Parallel to Actions, the value is State (rows of the matrix).
- // Grouped by the SymbolID, and only subranges are sorted.
- std::vector<StateID> States;
- // A flat list of available actions, sorted by (SymbolID, State).
- std::vector<Action> Actions;
-};
-llvm::raw_ostream &operator<<(llvm::raw_ostream &, const LRTable::Action &);
-
-} // namespace pseudo
-} // namespace syntax
-} // namespace clang
-
-#endif // LLVM_CLANG_TOOLING_SYNTAX_PSEUDO_LRTABLE_H
+++ /dev/null
-//===--- Token.h - Tokens and token streams in the pseudoparser --*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Tokens are the first level of abstraction above bytes used in pseudoparsing.
-// We use clang's lexer to scan the bytes (in raw mode, with no preprocessor).
-// The tokens is wrapped into pseudo::Token, along with line/indent info.
-//
-// Unlike clang, we make multiple passes over the whole file, out-of-order.
-// Therefore we retain the whole token sequence in memory. (This is feasible as
-// we process one file at a time). pseudo::TokenStream holds such a stream.
-// The initial stream holds the raw tokens read from the file, later passes
-// operate on derived TokenStreams (e.g. with directives stripped).
-//
-// Similar facilities from clang that are *not* used:
-// - SourceManager: designed around multiple files and precise macro expansion.
-// - clang::Token: coupled to SourceManager, doesn't retain layout info.
-// (pseudo::Token is similar, but without SourceLocations).
-// - syntax::TokenBuffer: coupled to SourceManager, has #includes and macros.
-// (pseudo::TokenStream is similar, but a flat token list).
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_CLANG_TOOLING_SYNTAX_TOKEN_H
-#define LLVM_CLANG_TOOLING_SYNTAX_TOKEN_H
-
-#include "clang/Basic/LLVM.h"
-#include "clang/Basic/TokenKinds.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/Support/raw_ostream.h"
-#include <cstdint>
-#include <limits>
-#include <memory>
-#include <vector>
-
-namespace clang {
-class LangOptions;
-namespace syntax {
-namespace pseudo {
-
-/// A single C++ or preprocessor token.
-///
-/// Unlike clang::Token and syntax::Token, these tokens are not connected to a
-/// SourceManager - we are not dealing with multiple files.
-struct Token {
- /// An Index identifies a token within a stream.
- using Index = uint32_t;
- /// A sentinel Index indicating no token.
- constexpr static Index Invalid = std::numeric_limits<Index>::max();
- struct Range;
-
- /// The token text.
- ///
- /// Typically from the original source file, but may have been synthesized.
- StringRef text() const { return StringRef(Data, Length); }
- const char *Data = nullptr;
- uint32_t Length = 0;
-
- /// Zero-based line number for the start of the token.
- /// This refers to the original source file as written.
- uint32_t Line = 0;
- /// Width of whitespace before the first token on this line.
- uint8_t Indent = 0;
- /// Flags have some meaning defined by the function that produced this stream.
- uint8_t Flags = 0;
- // Helpers to get/set Flags based on `enum class`.
- template <class T> bool flag(T Mask) const {
- return Flags & uint8_t{static_cast<std::underlying_type_t<T>>(Mask)};
- }
- template <class T> void setFlag(T Mask) {
- Flags |= uint8_t{static_cast<std::underlying_type_t<T>>(Mask)};
- }
-
- /// The type of token as determined by clang's lexer.
- clang::tok::TokenKind Kind = clang::tok::unknown;
-};
-static_assert(sizeof(Token) <= sizeof(char *) + 16, "Careful with layout!");
-llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Token &);
-
-/// A half-open range of tokens within a stream.
-struct Token::Range {
- Index Begin = 0;
- Index End = 0;
-
- uint32_t size() const { return End - Begin; }
- static Range emptyAt(Index Index) { return Range{Index, Index}; }
-};
-llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Token::Range &);
-
-/// A complete sequence of Tokens representing a source file.
-///
-/// This may match a raw file from disk, or be derived from a previous stream.
-/// For example, stripping comments from a TokenStream results in a new stream.
-///
-/// A stream has sentinel 'eof' tokens at each end, e.g `int main();` becomes:
-/// int main ( ) ;
-/// eof kw_int ident l_paren r_paren semi eof
-/// front() back()
-/// 0 1 2 3 4 5
-class TokenStream {
-public:
- /// Create an empty stream.
- ///
- /// Initially, the stream is appendable and not finalized.
- /// The token sequence may only be accessed after finalize() is called.
- ///
- /// Payload is an opaque object which will be owned by the stream.
- /// e.g. an allocator to hold backing storage for synthesized token text.
- explicit TokenStream(std::shared_ptr<void> Payload = nullptr);
-
- /// Append a token to the stream, which must not be finalized.
- void push(Token T) {
- assert(!isFinalized());
- Storage.push_back(std::move(T));
- }
-
- /// Finalize the token stream, allowing tokens to be accessed.
- /// Tokens may no longer be appended.
- void finalize();
- bool isFinalized() const;
-
- /// Returns the index of T within the stream.
- ///
- /// T must be within the stream or the end sentinel (not the start sentinel).
- Token::Index index(const Token &T) const {
- assert(isFinalized());
- assert(&T >= Storage.data() && &T < Storage.data() + Storage.size());
- assert(&T != Storage.data() && "start sentinel");
- return &T - Tokens.data();
- }
-
- ArrayRef<Token> tokens() const {
- assert(isFinalized());
- return Tokens;
- }
- ArrayRef<Token> tokens(Token::Range R) const {
- return tokens().slice(R.Begin, R.End - R.Begin);
- }
-
- /// May return the end sentinel if the stream is empty.
- const Token &front() const {
- assert(isFinalized());
- return Storage[1];
- }
-
- /// Print the tokens in this stream to the output stream.
- ///
- /// The presence of newlines/spaces is preserved, but not the quantity.
- void print(llvm::raw_ostream &) const;
-
-private:
- std::shared_ptr<void> Payload;
-
- MutableArrayRef<Token> Tokens;
- std::vector<Token> Storage; // eof + Tokens + eof
-};
-llvm::raw_ostream &operator<<(llvm::raw_ostream &, const TokenStream &);
-
-/// Extracts a raw token stream from the source code.
-///
-/// All tokens will reference the data of the provided string.
-/// "word-like" tokens such as identifiers and keywords will be raw_identifier.
-TokenStream lex(const std::string &, const clang::LangOptions &);
-enum class LexFlags : uint8_t {
- /// Marks the token at the start of a logical preprocessor line.
- /// This is a position where a directive might start.
- ///
-  /// Here, the first # is StartsPPLine, but the second is not (same logical
-  /// line).
- /// #define X(error) \
- /// #error // not a directive!
- ///
- /// Careful, the directive may not start exactly on the StartsPPLine token:
- /// /*comment*/ #include <foo.h>
- StartsPPLine = 1 << 0,
- /// Marks tokens containing trigraphs, escaped newlines, UCNs etc.
-  /// The text() of such tokens will contain the raw trigraph.
- NeedsCleaning = 1 << 1,
-};
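-
-// A minimal usage sketch (not part of the API): the pass that produces a
-// stream documents which enum its Flags byte uses; here, LexFlags for lex():
-//   Token Tok = ...;                       // a token from lex()
-//   Tok.setFlag(LexFlags::NeedsCleaning);  // producer marks the token
-//   if (Tok.flag(LexFlags::StartsPPLine))  // consumer tests a flag
-//     ...;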
-
-/// Derives a token stream by decoding escapes and interpreting raw_identifiers.
-///
-/// Tokens containing UCNs, escaped newlines, trigraphs etc are decoded and
-/// their backing data is owned by the returned stream.
-/// raw_identifier tokens are assigned specific types (identifier, keyword etc).
-///
-/// The StartsPPLine flag is preserved.
-///
-/// Formally, cooking identifiers happens after preprocessing: we should only
-/// cook raw_identifiers that survive preprocessing.
-/// However, ignoring the Token::Kind of tokens in directives achieves the same.
-/// (And having cooked token kinds in PP-disabled sections is useful for us).
-TokenStream cook(const TokenStream &, const clang::LangOptions &);
-
-/// Drops comment tokens.
-TokenStream stripComments(const TokenStream &);
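-
-// A typical pipeline, as a sketch (Code and LangOpts stand for the caller's
-// source string and language options):
-//   TokenStream Raw = lex(Code, LangOpts);       // raw_identifiers, comments
-//   TokenStream Cooked = cook(Raw, LangOpts);    // keywords, escapes decoded
-//   TokenStream Final = stripComments(Cooked);   // ready for parsing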
-
-} // namespace pseudo
-} // namespace syntax
-} // namespace clang
-
-#endif
DEPENDS
omp_gen
)
-
-add_subdirectory(Pseudo)
+++ /dev/null
-set(LLVM_LINK_COMPONENTS Support)
-
-add_clang_library(clangToolingSyntaxPseudo
- DirectiveMap.cpp
- Grammar.cpp
- GrammarBNF.cpp
- Lex.cpp
- LRGraph.cpp
- LRTable.cpp
- LRTableBuild.cpp
- Token.cpp
-
- LINK_LIBS
- clangBasic
- clangLex
- )
+++ /dev/null
-//===--- DirectiveMap.cpp - Find and strip preprocessor directives --------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang/Tooling/Syntax/Pseudo/DirectiveMap.h"
-#include "clang/Basic/IdentifierTable.h"
-#include "clang/Basic/TokenKinds.h"
-#include "llvm/Support/FormatVariadic.h"
-
-namespace clang {
-namespace syntax {
-namespace pseudo {
-namespace {
-
-class DirectiveParser {
-public:
- explicit DirectiveParser(const TokenStream &Code)
- : Code(Code), Tok(&Code.front()) {}
- void parse(DirectiveMap *Result) { parse(Result, /*TopLevel=*/true); }
-
-private:
- // Roles that a directive might take within a conditional block.
- enum class Cond { None, If, Else, End };
- static Cond classifyDirective(tok::PPKeywordKind K) {
- switch (K) {
- case clang::tok::pp_if:
- case clang::tok::pp_ifdef:
- case clang::tok::pp_ifndef:
- return Cond::If;
- case clang::tok::pp_elif:
- case clang::tok::pp_elifdef:
- case clang::tok::pp_elifndef:
- case clang::tok::pp_else:
- return Cond::Else;
- case clang::tok::pp_endif:
- return Cond::End;
- default:
- return Cond::None;
- }
- }
-
- // Parses tokens starting at Tok into Map.
- // If we reach an End or Else directive that ends Map, returns it.
- // If TopLevel is true, then we do not expect End and always return None.
- llvm::Optional<DirectiveMap::Directive> parse(DirectiveMap *Map,
- bool TopLevel) {
- auto StartsDirective =
- [&, AllowDirectiveAt((const Token *)nullptr)]() mutable {
- if (Tok->flag(LexFlags::StartsPPLine)) {
- // If we considered a comment at the start of a PP-line, it doesn't
- // start a directive but the directive can still start after it.
- if (Tok->Kind == tok::comment)
- AllowDirectiveAt = Tok + 1;
- return Tok->Kind == tok::hash;
- }
- return Tok->Kind == tok::hash && AllowDirectiveAt == Tok;
- };
- // Each iteration adds one chunk (or returns, if we see #endif).
- while (Tok->Kind != tok::eof) {
- // If there's no directive here, we have a code chunk.
- if (!StartsDirective()) {
- const Token *Start = Tok;
- do
- ++Tok;
- while (Tok->Kind != tok::eof && !StartsDirective());
- Map->Chunks.push_back(DirectiveMap::Code{
- Token::Range{Code.index(*Start), Code.index(*Tok)}});
- continue;
- }
-
- // We have some kind of directive.
- DirectiveMap::Directive Directive;
- parseDirective(&Directive);
- Cond Kind = classifyDirective(Directive.Kind);
- if (Kind == Cond::If) {
- // #if or similar, starting a nested conditional block.
- DirectiveMap::Conditional Conditional;
- Conditional.Branches.emplace_back();
- Conditional.Branches.back().first = std::move(Directive);
- parseConditional(&Conditional);
- Map->Chunks.push_back(std::move(Conditional));
- } else if ((Kind == Cond::Else || Kind == Cond::End) && !TopLevel) {
-        // #endif or similar, ending this DirectiveMap scope.
- // (#endif is unexpected at the top level, treat as simple directive).
- return std::move(Directive);
- } else {
- // #define or similar, a simple directive at the current scope.
- Map->Chunks.push_back(std::move(Directive));
- }
- }
- return None;
- }
-
- // Parse the rest of a conditional section, after seeing the If directive.
- // Returns after consuming the End directive.
- void parseConditional(DirectiveMap::Conditional *C) {
- assert(C->Branches.size() == 1 &&
- C->Branches.front().second.Chunks.empty() &&
- "Should be ready to parse first branch body");
- while (Tok->Kind != tok::eof) {
- auto Terminator = parse(&C->Branches.back().second, /*TopLevel=*/false);
- if (!Terminator) {
- assert(Tok->Kind == tok::eof && "gave up parsing before eof?");
- C->End.Tokens = Token::Range::emptyAt(Code.index(*Tok));
- return;
- }
- if (classifyDirective(Terminator->Kind) == Cond::End) {
- C->End = std::move(*Terminator);
- return;
- }
- assert(classifyDirective(Terminator->Kind) == Cond::Else &&
- "ended branch unexpectedly");
- C->Branches.emplace_back();
- C->Branches.back().first = std::move(*Terminator);
- }
- }
-
- // Parse a directive. Tok is the hash.
- void parseDirective(DirectiveMap::Directive *D) {
- assert(Tok->Kind == tok::hash);
-
- // Directive spans from the hash until the end of line or file.
- const Token *Begin = Tok++;
- while (Tok->Kind != tok::eof && !Tok->flag(LexFlags::StartsPPLine))
- ++Tok;
- ArrayRef<Token> Tokens{Begin, Tok};
- D->Tokens = {Code.index(*Tokens.begin()), Code.index(*Tokens.end())};
-
- // Directive name is the first non-comment token after the hash.
- Tokens = Tokens.drop_front().drop_while(
- [](const Token &T) { return T.Kind == tok::comment; });
- if (!Tokens.empty())
- D->Kind = PPKeywords.get(Tokens.front().text()).getPPKeywordID();
- }
-
- const TokenStream &Code;
- const Token *Tok;
- clang::IdentifierTable PPKeywords;
-};
-
-} // namespace
-
-DirectiveMap DirectiveMap::parse(const TokenStream &Code) {
- DirectiveMap Result;
- DirectiveParser(Code).parse(&Result);
- return Result;
-}
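-
-// A sketch of the result: parsing the stream of
-//   #ifndef X
-//   int x;
-//   #endif
-// yields a map with a single Conditional chunk, dumped (see below) as:
-//   #ifndef (3 tokens)
-//     code (3 tokens)
-//   #endif (2 tokens)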
-
-static void dump(llvm::raw_ostream &OS, const DirectiveMap &, unsigned Indent);
-static void dump(llvm::raw_ostream &OS, const DirectiveMap::Directive &Directive,
- unsigned Indent) {
- OS.indent(Indent) << llvm::formatv("#{0} ({1} tokens)\n",
- tok::getPPKeywordSpelling(Directive.Kind),
- Directive.Tokens.size());
-}
-static void dump(llvm::raw_ostream &OS, const DirectiveMap::Code &Code,
- unsigned Indent) {
- OS.indent(Indent) << llvm::formatv("code ({0} tokens)\n", Code.Tokens.size());
-}
-static void dump(llvm::raw_ostream &OS,
- const DirectiveMap::Conditional &Conditional,
- unsigned Indent) {
- for (const auto &Branch : Conditional.Branches) {
- dump(OS, Branch.first, Indent);
- dump(OS, Branch.second, Indent + 2);
- }
- dump(OS, Conditional.End, Indent);
-}
-
-static void dump(llvm::raw_ostream &OS, const DirectiveMap::Chunk &Chunk,
- unsigned Indent) {
- switch (Chunk.kind()) {
- case DirectiveMap::Chunk::K_Empty:
- llvm_unreachable("invalid chunk");
- case DirectiveMap::Chunk::K_Code:
- return dump(OS, (const DirectiveMap::Code &)Chunk, Indent);
- case DirectiveMap::Chunk::K_Directive:
- return dump(OS, (const DirectiveMap::Directive &)Chunk, Indent);
- case DirectiveMap::Chunk::K_Conditional:
- return dump(OS, (const DirectiveMap::Conditional &)Chunk, Indent);
- }
-}
-
-static void dump(llvm::raw_ostream &OS, const DirectiveMap &Map,
- unsigned Indent) {
- for (const auto &Chunk : Map.Chunks)
- dump(OS, Chunk, Indent);
-}
-
-// Define operator<< in terms of dump() functions above.
-#define OSTREAM_DUMP(Type) \
- llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Type &T) { \
- dump(OS, T, 0); \
- return OS; \
- }
-OSTREAM_DUMP(DirectiveMap)
-OSTREAM_DUMP(DirectiveMap::Chunk)
-OSTREAM_DUMP(DirectiveMap::Directive)
-OSTREAM_DUMP(DirectiveMap::Conditional)
-OSTREAM_DUMP(DirectiveMap::Code)
-#undef OSTREAM_DUMP
-
-} // namespace pseudo
-} // namespace syntax
-} // namespace clang
+++ /dev/null
-//===--- Grammar.cpp - Grammar for clang pseudo parser ----------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang/Tooling/Syntax/Pseudo/Grammar.h"
-#include "clang/Basic/TokenKinds.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Support/FormatVariadic.h"
-#include "llvm/Support/raw_ostream.h"
-
-namespace clang {
-namespace syntax {
-namespace pseudo {
-
-Rule::Rule(SymbolID Target, llvm::ArrayRef<SymbolID> Sequence)
- : Target(Target), Size(static_cast<uint8_t>(Sequence.size())) {
- assert(Sequence.size() <= Rule::MaxElements);
- llvm::copy(Sequence, this->Sequence);
-}
-
-Grammar::Grammar(std::unique_ptr<GrammarTable> Table) : T(std::move(Table)) {
-  // The start symbol is named _; find it by binary search.
- auto It = llvm::partition_point(
- T->Nonterminals,
- [](const GrammarTable::Nonterminal &X) { return X.Name < "_"; });
- assert(It != T->Nonterminals.end() && It->Name == "_" &&
- "symbol _ must exist in the grammar!");
- StartSymbol = It - T->Nonterminals.begin();
-}
-
-llvm::ArrayRef<Rule> Grammar::rulesFor(SymbolID SID) const {
- assert(isNonterminal(SID));
- const auto &R = T->Nonterminals[SID].RuleRange;
- assert(R.end <= T->Rules.size());
- return llvm::makeArrayRef(&T->Rules[R.start], R.end - R.start);
-}
-
-const Rule &Grammar::lookupRule(RuleID RID) const {
- assert(RID < T->Rules.size());
- return T->Rules[RID];
-}
-
-llvm::StringRef Grammar::symbolName(SymbolID SID) const {
- if (isToken(SID))
- return T->Terminals[symbolToToken(SID)];
- return T->Nonterminals[SID].Name;
-}
-
-std::string Grammar::dumpRule(RuleID RID) const {
- std::string Result;
- llvm::raw_string_ostream OS(Result);
- const Rule &R = T->Rules[RID];
- OS << symbolName(R.Target) << " :=";
- for (SymbolID SID : R.seq())
- OS << " " << symbolName(SID);
- return Result;
-}
-
-std::string Grammar::dumpRules(SymbolID SID) const {
- assert(isNonterminal(SID));
- std::string Result;
- const auto &Range = T->Nonterminals[SID].RuleRange;
- for (RuleID RID = Range.start; RID < Range.end; ++RID)
- Result.append(dumpRule(RID)).push_back('\n');
- return Result;
-}
-
-std::string Grammar::dump() const {
- std::string Result;
- llvm::raw_string_ostream OS(Result);
- OS << "Nonterminals:\n";
- for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID)
- OS << llvm::formatv(" {0} {1}\n", SID, symbolName(SID));
- OS << "Rules:\n";
- for (RuleID RID = 0; RID < T->Rules.size(); ++RID)
- OS << llvm::formatv(" {0} {1}\n", RID, dumpRule(RID));
- return OS.str();
-}
-
-std::vector<llvm::DenseSet<SymbolID>> firstSets(const Grammar &G) {
- std::vector<llvm::DenseSet<SymbolID>> FirstSets(
- G.table().Nonterminals.size());
- auto ExpandFirstSet = [&FirstSets](SymbolID Target, SymbolID First) {
- assert(isNonterminal(Target));
- if (isToken(First))
- return FirstSets[Target].insert(First).second;
- bool Changed = false;
- for (SymbolID SID : FirstSets[First])
- Changed |= FirstSets[Target].insert(SID).second;
- return Changed;
- };
-
- // A rule S := T ... implies elements in FIRST(S):
- // - if T is a terminal, FIRST(S) contains T
- // - if T is a nonterminal, FIRST(S) contains FIRST(T)
- // Since FIRST(T) may not have been fully computed yet, FIRST(S) itself may
- // end up being incomplete.
- // We iterate until we hit a fixed point.
- // (This isn't particularly efficient, but table building isn't on the
- // critical path).
- bool Changed = true;
- while (Changed) {
- Changed = false;
- for (const auto &R : G.table().Rules)
- // We only need to consider the first element because symbols are
- // non-nullable.
- Changed |= ExpandFirstSet(R.Target, R.seq().front());
- }
- return FirstSets;
-}
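-
-// A worked example (sketch) for the toy grammar
-//   _ := E,  E := E - T,  E := T,  T := n,  T := ( E )
-// T := n and T := ( E ) seed FIRST(T) = {n, (}; E := T then propagates it,
-// and the fixed point is FIRST(E) = FIRST(T) = {n, (}.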
-
-std::vector<llvm::DenseSet<SymbolID>> followSets(const Grammar &G) {
- auto FirstSets = firstSets(G);
- std::vector<llvm::DenseSet<SymbolID>> FollowSets(
- G.table().Nonterminals.size());
- // Expand the follow set of a non-terminal symbol Y by adding all from the
- // given symbol set.
- auto ExpandFollowSet = [&FollowSets](SymbolID Y,
- const llvm::DenseSet<SymbolID> &ToAdd) {
- assert(isNonterminal(Y));
- bool Changed = false;
- for (SymbolID F : ToAdd)
- Changed |= FollowSets[Y].insert(F).second;
- return Changed;
- };
-  // Follow sets are computed based on the following 3 rules; the computation
-  // reaches a fixed point when no new symbols can be added to any of the
-  // follow sets.
- //
- // Rule 1: add endmarker to the FOLLOW(S), where S is the start symbol.
- FollowSets[G.startSymbol()].insert(tokenSymbol(tok::eof));
- bool Changed = true;
- while (Changed) {
- Changed = false;
- for (const auto &R : G.table().Rules) {
- // Rule 2: for a rule X := ... Y Z, we add all symbols from FIRST(Z) to
- // FOLLOW(Y).
- for (size_t i = 0; i + 1 < R.seq().size(); ++i) {
- if (isToken(R.seq()[i]))
- continue;
- // We only need to consider the next symbol because symbols are
- // non-nullable.
- SymbolID Next = R.seq()[i + 1];
- if (isToken(Next))
- // First set for a terminal is itself.
- Changed |= ExpandFollowSet(R.seq()[i], {Next});
- else
- Changed |= ExpandFollowSet(R.seq()[i], FirstSets[Next]);
- }
- // Rule 3: for a rule X := ... Z, we add all symbols from FOLLOW(X) to
- // FOLLOW(Z).
- SymbolID Z = R.seq().back();
- if (isNonterminal(Z))
- Changed |= ExpandFollowSet(Z, FollowSets[R.Target]);
- }
- }
- return FollowSets;
-}
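-
-// Continuing the sketch above: rule 1 gives FOLLOW(_) = {eof}; rule 3 applied
-// to _ := E and E := E - T makes FOLLOW(E) include FOLLOW(_) and FOLLOW(T)
-// include FOLLOW(E); rule 2 adds - (from E := E - T) and ) (from T := ( E )).
-// Fixed point: FOLLOW(E) = FOLLOW(T) = {eof, -, )}.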
-
-static llvm::ArrayRef<std::string> getTerminalNames() {
- static const std::vector<std::string> *TerminalNames = []() {
- static std::vector<std::string> TerminalNames;
- TerminalNames.reserve(NumTerminals);
- for (unsigned I = 0; I < NumTerminals; ++I) {
- tok::TokenKind K = static_cast<tok::TokenKind>(I);
- if (const auto *Punc = tok::getPunctuatorSpelling(K))
- TerminalNames.push_back(Punc);
- else
- TerminalNames.push_back(llvm::StringRef(tok::getTokenName(K)).upper());
- }
- return &TerminalNames;
- }();
- return *TerminalNames;
-}
-GrammarTable::GrammarTable() : Terminals(getTerminalNames()) {}
-
-} // namespace pseudo
-} // namespace syntax
-} // namespace clang
+++ /dev/null
-//===--- GrammarBNF.cpp - build grammar from BNF files ----------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang/Basic/TokenKinds.h"
-#include "clang/Tooling/Syntax/Pseudo/Grammar.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/Support/FormatVariadic.h"
-#include <memory>
-
-namespace clang {
-namespace syntax {
-namespace pseudo {
-
-namespace {
-static const llvm::StringRef OptSuffix = "_opt";
-static const llvm::StringRef StartSymbol = "_";
-
-// Builds a grammar from BNF files.
-class GrammarBuilder {
-public:
- GrammarBuilder(std::vector<std::string> &Diagnostics)
- : Diagnostics(Diagnostics) {}
-
- std::unique_ptr<Grammar> build(llvm::StringRef BNF) {
- auto Specs = eliminateOptional(parse(BNF));
-
- assert(llvm::all_of(Specs,
- [](const RuleSpec &R) {
- if (R.Target.endswith(OptSuffix))
- return false;
- return llvm::all_of(
- R.Sequence, [](const RuleSpec::Element &E) {
- return !E.Symbol.endswith(OptSuffix);
- });
- }) &&
- "Optional symbols should be eliminated!");
-
- auto T = std::make_unique<GrammarTable>();
-
- // Assemble the name->ID and ID->nonterminal name maps.
- llvm::DenseSet<llvm::StringRef> UniqueNonterminals;
- llvm::DenseMap<llvm::StringRef, SymbolID> SymbolIds;
- for (uint16_t I = 0; I < NumTerminals; ++I)
- SymbolIds.try_emplace(T->Terminals[I], tokenSymbol(tok::TokenKind(I)));
- auto Consider = [&](llvm::StringRef Name) {
- if (!SymbolIds.count(Name))
- UniqueNonterminals.insert(Name);
- };
- for (const auto &Spec : Specs) {
- Consider(Spec.Target);
- for (const RuleSpec::Element &Elt : Spec.Sequence)
- Consider(Elt.Symbol);
- }
- llvm::for_each(UniqueNonterminals, [&T](llvm::StringRef Name) {
- T->Nonterminals.emplace_back();
- T->Nonterminals.back().Name = Name.str();
- });
- assert(T->Nonterminals.size() < (1 << (SymbolBits - 1)) &&
- "Too many nonterminals to fit in SymbolID bits!");
- llvm::sort(T->Nonterminals, [](const GrammarTable::Nonterminal &L,
- const GrammarTable::Nonterminal &R) {
- return L.Name < R.Name;
- });
- // Build name -> ID maps for nonterminals.
- for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID)
- SymbolIds.try_emplace(T->Nonterminals[SID].Name, SID);
-
- // Convert the rules.
- T->Rules.reserve(Specs.size());
- std::vector<SymbolID> Symbols;
- auto Lookup = [SymbolIds](llvm::StringRef Name) {
- auto It = SymbolIds.find(Name);
- assert(It != SymbolIds.end() && "Didn't find the symbol in SymbolIds!");
- return It->second;
- };
- for (const auto &Spec : Specs) {
- assert(Spec.Sequence.size() <= Rule::MaxElements);
- Symbols.clear();
- for (const RuleSpec::Element &Elt : Spec.Sequence)
- Symbols.push_back(Lookup(Elt.Symbol));
- T->Rules.push_back(Rule(Lookup(Spec.Target), Symbols));
- }
- assert(T->Rules.size() < (1 << RuleBits) &&
- "Too many rules to fit in RuleID bits!");
- llvm::sort(T->Rules, [](const Rule &Left, const Rule &Right) {
- // Sorted by the Target.
- return std::tie(Left.Target, Left.Size) <
- std::tie(Right.Target, Right.Size);
- });
- RuleID RulePos = 0;
- for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID) {
- RuleID Start = RulePos;
- while (RulePos < T->Rules.size() && T->Rules[RulePos].Target == SID)
- ++RulePos;
- T->Nonterminals[SID].RuleRange = {Start, RulePos};
- }
- auto G = std::make_unique<Grammar>(std::move(T));
- diagnoseGrammar(*G);
- return G;
- }
-
-private:
- // Text representation of a BNF grammar rule.
- struct RuleSpec {
- llvm::StringRef Target;
- struct Element {
- llvm::StringRef Symbol; // Name of the symbol
- };
- std::vector<Element> Sequence;
-
- std::string toString() const {
- std::vector<llvm::StringRef> Body;
- for (const auto &E : Sequence)
- Body.push_back(E.Symbol);
- return llvm::formatv("{0} := {1}", Target, llvm::join(Body, " "));
- }
- };
-
- std::vector<RuleSpec> parse(llvm::StringRef Lines) {
- std::vector<RuleSpec> Specs;
- for (llvm::StringRef Line : llvm::split(Lines, '\n')) {
- Line = Line.trim();
- // Strip anything coming after the '#' (comment).
- Line = Line.take_while([](char C) { return C != '#'; });
- if (Line.empty())
- continue;
- RuleSpec Rule;
- if (parseLine(Line, Rule))
- Specs.push_back(std::move(Rule));
- }
- return Specs;
- }
-
- bool parseLine(llvm::StringRef Line, RuleSpec &Out) {
- auto Parts = Line.split(":=");
- if (Parts.first == Line) { // no separator in Line
- Diagnostics.push_back(
- llvm::formatv("Failed to parse '{0}': no separator :=", Line).str());
- return false;
- }
-
- Out.Target = Parts.first.trim();
- Out.Sequence.clear();
- for (llvm::StringRef Chunk : llvm::split(Parts.second, ' ')) {
- Chunk = Chunk.trim();
- if (Chunk.empty())
- continue; // skip empty
-
- Out.Sequence.push_back({Chunk});
- }
- return true;
- };
-
-  // Inlines all _opt symbols.
-  // For example, given a rule E := id +_opt id, elimination produces two
-  // equivalent rules:
-  //   1) E := id + id
-  //   2) E := id id
- std::vector<RuleSpec> eliminateOptional(llvm::ArrayRef<RuleSpec> Input) {
- std::vector<RuleSpec> Results;
- std::vector<RuleSpec::Element> Storage;
- for (const auto &R : Input) {
- eliminateOptionalTail(
- R.Sequence, Storage, [&Results, &Storage, &R, this]() {
- if (Storage.empty()) {
- Diagnostics.push_back(
- llvm::formatv("Rule '{0}' has a nullable RHS", R.toString()));
- return;
- }
- Results.push_back({R.Target, Storage});
- });
- assert(Storage.empty());
- }
- return Results;
- }
- void eliminateOptionalTail(llvm::ArrayRef<RuleSpec::Element> Elements,
- std::vector<RuleSpec::Element> &Result,
- llvm::function_ref<void()> CB) {
- if (Elements.empty())
- return CB();
- auto Front = Elements.front();
- if (!Front.Symbol.endswith(OptSuffix)) {
- Result.push_back(std::move(Front));
- eliminateOptionalTail(Elements.drop_front(1), Result, CB);
- Result.pop_back();
- return;
- }
- // Enumerate two options: skip the opt symbol, or inline the symbol.
- eliminateOptionalTail(Elements.drop_front(1), Result, CB); // skip
- Front.Symbol = Front.Symbol.drop_back(OptSuffix.size()); // drop "_opt"
- Result.push_back(std::move(Front));
- eliminateOptionalTail(Elements.drop_front(1), Result, CB);
- Result.pop_back();
- }
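-
-  // As a sketch: for Elements = [a_opt, b], CB fires twice, with Result = [b]
-  // (a_opt skipped) and Result = [a, b] (a_opt inlined). A rule whose symbols
-  // are all _opt also produces the empty sequence, which eliminateOptional
-  // above diagnoses as a nullable RHS.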
-
-  // Diagnoses the grammar and emits warnings, if any.
- void diagnoseGrammar(const Grammar &G) {
- const auto &T = G.table();
- for (SymbolID SID = 0; SID < T.Nonterminals.size(); ++SID) {
- auto Range = T.Nonterminals[SID].RuleRange;
- if (Range.start == Range.end)
- Diagnostics.push_back(
- llvm::formatv("No rules for nonterminal: {0}", G.symbolName(SID)));
- llvm::StringRef NameRef = T.Nonterminals[SID].Name;
- if (llvm::all_of(NameRef, llvm::isAlpha) && NameRef.upper() == NameRef) {
- Diagnostics.push_back(llvm::formatv(
- "Token-like name {0} is used as a nonterminal", G.symbolName(SID)));
- }
- }
- for (RuleID RID = 0; RID + 1u < T.Rules.size(); ++RID) {
- if (T.Rules[RID] == T.Rules[RID + 1])
- Diagnostics.push_back(
- llvm::formatv("Duplicate rule: `{0}`", G.dumpRule(RID)));
- // Warning for nullable non-terminals
- if (T.Rules[RID].Size == 0)
- Diagnostics.push_back(
- llvm::formatv("Rule `{0}` has a nullable RHS", G.dumpRule(RID)));
- }
- // symbol-id -> used counts
- std::vector<unsigned> UseCounts(T.Nonterminals.size(), 0);
- for (const Rule &R : T.Rules)
- for (SymbolID SID : R.seq())
- if (isNonterminal(SID))
- ++UseCounts[SID];
- for (SymbolID SID = 0; SID < UseCounts.size(); ++SID)
- if (UseCounts[SID] == 0 && T.Nonterminals[SID].Name != StartSymbol)
- Diagnostics.push_back(
- llvm::formatv("Nonterminal never used: {0}", G.symbolName(SID)));
- }
- std::vector<std::string> &Diagnostics;
-};
-} // namespace
-
-std::unique_ptr<Grammar>
-Grammar::parseBNF(llvm::StringRef BNF, std::vector<std::string> &Diagnostics) {
- Diagnostics.clear();
- return GrammarBuilder(Diagnostics).build(BNF);
-}
-
-} // namespace pseudo
-} // namespace syntax
-} // namespace clang
+++ /dev/null
-//===--- LRGraph.cpp - -------------------------------------------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang/Tooling/Syntax/Pseudo/LRGraph.h"
-#include "clang/Tooling/Syntax/Pseudo/Grammar.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/Hashing.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/Support/FormatVariadic.h"
-#include "llvm/Support/raw_ostream.h"
-
-using ItemSet = std::vector<clang::syntax::pseudo::Item>;
-
-namespace llvm {
-// Support ItemSet (std::vector<Item>) as DenseMap keys.
-template <> struct DenseMapInfo<ItemSet> {
- static inline ItemSet getEmptyKey() {
- return {DenseMapInfo<clang::syntax::pseudo::Item>::getEmptyKey()};
- }
- static inline ItemSet getTombstoneKey() {
- return {DenseMapInfo<clang::syntax::pseudo::Item>::getTombstoneKey()};
- }
- static unsigned getHashValue(const ItemSet &I) {
- return llvm::hash_combine_range(I.begin(), I.end());
- }
- static bool isEqual(const ItemSet &LHS, const ItemSet &RHS) {
- return LHS == RHS;
- }
-};
-} // namespace llvm
-
-namespace clang {
-namespace syntax {
-namespace pseudo {
-namespace {
-
-struct SortByNextSymbol {
- SortByNextSymbol(const Grammar &G) : G(G) {}
- bool operator()(const Item &L, const Item &R) {
- if (L.hasNext() && R.hasNext() && L.next(G) != R.next(G))
- return L.next(G) < R.next(G);
- if (L.hasNext() != R.hasNext())
- return L.hasNext() < R.hasNext(); // a trailing dot is minimal.
- return L < R;
- }
- const Grammar &G;
-};
-
-// Computes the closure of the given item set S:
-// - extends S to contain all options for parsing the next token;
-// - nonterminals after a dot are recursively expanded into the begin-state
-// of all production rules that produce that nonterminal;
-//
-// Given
-// Grammar rules = [ _ := E, E := E - T, E := T, T := n, T := ( E ) ]
-// Input = [ E := . T ]
-// returns [ E := . T, T := . n, T := . ( E ) ]
-State closure(ItemSet Queue, const Grammar &G) {
- llvm::DenseSet<Item> InQueue = {Queue.begin(), Queue.end()};
- // We reuse the passed-by-value Queue as the final result, as it's already
- // initialized to the right elements.
- size_t ItIndex = 0;
- while (ItIndex < Queue.size()) {
- const Item &ExpandingItem = Queue[ItIndex];
- ++ItIndex;
- if (!ExpandingItem.hasNext())
- continue;
-
- SymbolID NextSym = ExpandingItem.next(G);
- if (pseudo::isToken(NextSym))
- continue;
- auto RRange = G.table().Nonterminals[NextSym].RuleRange;
- for (RuleID RID = RRange.start; RID < RRange.end; ++RID) {
- Item NewItem = Item::start(RID, G);
- if (InQueue.insert(NewItem).second) // new
- Queue.push_back(std::move(NewItem));
- }
- }
- Queue.shrink_to_fit();
- llvm::sort(Queue, SortByNextSymbol(G));
- return {std::move(Queue)};
-}
-
-// Returns all successor kernel item sets (with the dot advanced), partitioned
-// by the advanced symbol.
-//
-// Given
-// S = [ E := . a b, E := E . - T ]
-// returns [
-// {id(a), [ E := a . b ]},
-// {id(-), [ E := E - . T ]}
-// ]
-std::vector<std::pair<SymbolID, ItemSet>>
-nextAvailableKernelItems(const State &S, const Grammar &G) {
- std::vector<std::pair<SymbolID, ItemSet>> Results;
- llvm::ArrayRef<Item> AllItems = S.Items;
- AllItems = AllItems.drop_while([](const Item &I) { return !I.hasNext(); });
- while (!AllItems.empty()) {
- SymbolID AdvancedSymbol = AllItems.front().next(G);
- auto Batch = AllItems.take_while([AdvancedSymbol, &G](const Item &I) {
- assert(I.hasNext());
- return I.next(G) == AdvancedSymbol;
- });
- assert(!Batch.empty());
- AllItems = AllItems.drop_front(Batch.size());
-
- // Advance a dot over the Symbol.
- ItemSet Next;
- for (const Item &I : Batch)
- Next.push_back(I.advance());
-    // Sort the set to keep the order deterministic for hash computation.
- llvm::sort(Next);
- Results.push_back({AdvancedSymbol, std::move(Next)});
- }
- return Results;
-}
-
-} // namespace
-
-std::string Item::dump(const Grammar &G) const {
- const auto &Rule = G.lookupRule(RID);
- auto ToNames = [&](llvm::ArrayRef<SymbolID> Syms) {
- std::vector<llvm::StringRef> Results;
- for (auto SID : Syms)
- Results.push_back(G.symbolName(SID));
- return Results;
- };
- return llvm::formatv("{0} := {1} • {2}", G.symbolName(Rule.Target),
- llvm::join(ToNames(Rule.seq().take_front(DotPos)), " "),
- llvm::join(ToNames(Rule.seq().drop_front(DotPos)), " "))
- .str();
-}
-
-std::string State::dump(const Grammar &G, unsigned Indent) const {
- std::string Result;
- llvm::raw_string_ostream OS(Result);
- for (const auto &Item : Items)
- OS.indent(Indent) << llvm::formatv("{0}\n", Item.dump(G));
- return OS.str();
-}
-
-std::string LRGraph::dumpForTests(const Grammar &G) const {
- std::string Result;
- llvm::raw_string_ostream OS(Result);
- OS << "States:\n";
- for (StateID ID = 0; ID < States.size(); ++ID) {
- OS << llvm::formatv("State {0}\n", ID);
- OS << States[ID].dump(G, /*Indent*/ 4);
- }
- for (const auto &E : Edges) {
- OS << llvm::formatv("{0} ->[{1}] {2}\n", E.Src, G.symbolName(E.Label),
- E.Dst);
- }
- return OS.str();
-}
-
-LRGraph LRGraph::buildLR0(const Grammar &G) {
- class Builder {
- public:
- Builder(const Grammar &G) : G(G) {}
-
-    // Adds the given state if it doesn't already exist.
- std::pair<StateID, /*inserted*/ bool> insert(ItemSet KernelItems) {
- assert(llvm::is_sorted(KernelItems) &&
- "Item must be sorted before inserting to a hash map!");
- auto It = StatesIndex.find(KernelItems);
- if (It != StatesIndex.end())
- return {It->second, false};
- States.push_back(closure(KernelItems, G));
- StateID NextStateID = States.size() - 1;
- StatesIndex.insert({std::move(KernelItems), NextStateID});
- return {NextStateID, true};
- }
-
- void insertEdge(StateID Src, StateID Dst, SymbolID Label) {
- Edges.push_back({Src, Dst, Label});
- }
-
- // Returns a state with the given id.
- const State &find(StateID ID) const {
- assert(ID < States.size());
- return States[ID];
- }
-
- LRGraph build() && {
- States.shrink_to_fit();
- Edges.shrink_to_fit();
- return LRGraph(std::move(States), std::move(Edges));
- }
-
- private:
-    // Keyed by the **kernel** item sets.
- llvm::DenseMap<ItemSet, /*index of States*/ size_t> StatesIndex;
- std::vector<State> States;
- std::vector<Edge> Edges;
- const Grammar &G;
- } Builder(G);
-
- std::vector<StateID> PendingStates;
- // Initialize states with the start symbol.
- auto RRange = G.table().Nonterminals[G.startSymbol()].RuleRange;
- for (RuleID RID = RRange.start; RID < RRange.end; ++RID) {
- auto StartState = std::vector<Item>{Item::start(RID, G)};
- auto Result = Builder.insert(std::move(StartState));
- assert(Result.second && "State must be new");
- PendingStates.push_back(Result.first);
- }
-
- while (!PendingStates.empty()) {
- auto CurrentStateID = PendingStates.back();
- PendingStates.pop_back();
- for (auto Next :
- nextAvailableKernelItems(Builder.find(CurrentStateID), G)) {
- auto Insert = Builder.insert(Next.second);
- if (Insert.second) // new state, insert to the pending queue.
- PendingStates.push_back(Insert.first);
- Builder.insertEdge(CurrentStateID, Insert.first, Next.first);
- }
- }
- return std::move(Builder).build();
-}
-
-} // namespace pseudo
-} // namespace syntax
-} // namespace clang
+++ /dev/null
-//===--- LRTable.cpp - Parsing table for LR parsers --------------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang/Tooling/Syntax/Pseudo/LRTable.h"
-#include "clang/Tooling/Syntax/Pseudo/Grammar.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/FormatVariadic.h"
-#include "llvm/Support/raw_ostream.h"
-
-namespace clang {
-namespace syntax {
-namespace pseudo {
-
-llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const LRTable::Action &A) {
- switch (A.kind()) {
- case LRTable::Action::Shift:
- return OS << llvm::formatv("shift state {0}", A.getShiftState());
- case LRTable::Action::Reduce:
- return OS << llvm::formatv("reduce by rule {0}", A.getReduceRule());
- case LRTable::Action::GoTo:
- return OS << llvm::formatv("go to state {0}", A.getGoToState());
- case LRTable::Action::Accept:
- return OS << "acc";
- case LRTable::Action::Sentinel:
- llvm_unreachable("unexpected Sentinel action kind!");
- }
- llvm_unreachable("unexpected action kind!");
-}
-
-std::string LRTable::dumpStatistics() const {
- StateID NumOfStates = 0;
- for (StateID It : States)
- NumOfStates = std::max(It, NumOfStates);
- return llvm::formatv(R"(
-Statistics of the LR parsing table:
- number of states: {0}
- number of actions: {1}
- size of the table (bytes): {2}
-)",
- NumOfStates, Actions.size(), bytes())
- .str();
-}
-
-std::string LRTable::dumpForTests(const Grammar &G) const {
- std::string Result;
- llvm::raw_string_ostream OS(Result);
- StateID MaxState = 0;
- for (StateID It : States)
- MaxState = std::max(MaxState, It);
- OS << "LRTable:\n";
- for (StateID S = 0; S <= MaxState; ++S) {
- OS << llvm::formatv("State {0}\n", S);
- for (uint16_t Terminal = 0; Terminal < NumTerminals; ++Terminal) {
- SymbolID TokID = tokenSymbol(static_cast<tok::TokenKind>(Terminal));
- for (auto A : find(S, TokID)) {
- if (A.kind() == LRTable::Action::Shift)
- OS.indent(4) << llvm::formatv("'{0}': shift state {1}\n",
- G.symbolName(TokID), A.getShiftState());
- else if (A.kind() == LRTable::Action::Reduce)
- OS.indent(4) << llvm::formatv("'{0}': reduce by rule {1} '{2}'\n",
- G.symbolName(TokID), A.getReduceRule(),
- G.dumpRule(A.getReduceRule()));
- else if (A.kind() == LRTable::Action::Accept)
- OS.indent(4) << llvm::formatv("'{0}': accept\n", G.symbolName(TokID));
- }
- }
- for (SymbolID NontermID = 0; NontermID < G.table().Nonterminals.size();
- ++NontermID) {
- if (find(S, NontermID).empty())
- continue;
- OS.indent(4) << llvm::formatv("'{0}': go to state {1}\n",
- G.symbolName(NontermID),
- getGoToState(S, NontermID));
- }
- }
- return OS.str();
-}
-
-llvm::ArrayRef<LRTable::Action> LRTable::getActions(StateID State,
- SymbolID Terminal) const {
- assert(pseudo::isToken(Terminal) && "expect terminal symbol!");
- return find(State, Terminal);
-}
-
-LRTable::StateID LRTable::getGoToState(StateID State,
- SymbolID Nonterminal) const {
- assert(pseudo::isNonterminal(Nonterminal) && "expected nonterminal symbol!");
- auto Result = find(State, Nonterminal);
- assert(Result.size() == 1 && Result.front().kind() == Action::GoTo);
- return Result.front().getGoToState();
-}
-
-llvm::ArrayRef<LRTable::Action> LRTable::find(StateID Src, SymbolID ID) const {
- size_t Idx = isToken(ID) ? symbolToToken(ID) : ID;
- assert(isToken(ID) ? Idx + 1 < TerminalOffset.size()
- : Idx + 1 < NontermOffset.size());
- std::pair<size_t, size_t> TargetStateRange =
- isToken(ID) ? std::make_pair(TerminalOffset[Idx], TerminalOffset[Idx + 1])
- : std::make_pair(NontermOffset[Idx], NontermOffset[Idx + 1]);
- auto TargetedStates =
- llvm::makeArrayRef(States.data() + TargetStateRange.first,
- States.data() + TargetStateRange.second);
-
- assert(llvm::is_sorted(TargetedStates) &&
- "subrange of the StateIdx should be sorted!");
- const LRTable::StateID *Start = llvm::partition_point(
- TargetedStates, [&Src](LRTable::StateID S) { return S < Src; });
- if (Start == TargetedStates.end())
- return {};
- const LRTable::StateID *End = Start;
- while (End != TargetedStates.end() && *End == Src)
- ++End;
- return llvm::makeArrayRef(&Actions[Start - States.data()],
- /*length=*/End - Start);
-}
-
-} // namespace pseudo
-} // namespace syntax
-} // namespace clang
+++ /dev/null
-//===--- LRTableBuild.cpp - Build a LRTable from LRGraph ---------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang/Basic/TokenKinds.h"
-#include "clang/Tooling/Syntax/Pseudo/Grammar.h"
-#include "clang/Tooling/Syntax/Pseudo/LRGraph.h"
-#include "clang/Tooling/Syntax/Pseudo/LRTable.h"
-#include <cstdint>
-
-namespace llvm {
-template <> struct DenseMapInfo<clang::syntax::pseudo::LRTable::Entry> {
- using Entry = clang::syntax::pseudo::LRTable::Entry;
- static inline Entry getEmptyKey() {
- static Entry E{static_cast<clang::syntax::pseudo::SymbolID>(-1), 0,
- clang::syntax::pseudo::LRTable::Action::sentinel()};
- return E;
- }
- static inline Entry getTombstoneKey() {
- static Entry E{static_cast<clang::syntax::pseudo::SymbolID>(-2), 0,
- clang::syntax::pseudo::LRTable::Action::sentinel()};
- return E;
- }
- static unsigned getHashValue(const Entry &I) {
- return llvm::hash_combine(I.State, I.Symbol, I.Act.opaque());
- }
- static bool isEqual(const Entry &LHS, const Entry &RHS) {
- return LHS.State == RHS.State && LHS.Symbol == RHS.Symbol &&
- LHS.Act == RHS.Act;
- }
-};
-} // namespace llvm
-
-namespace clang {
-namespace syntax {
-namespace pseudo {
-
-class LRTable::Builder {
-public:
- bool insert(Entry E) { return Entries.insert(std::move(E)).second; }
-  LRTable build(const GrammarTable &GT) && {
- // E.g. given the following parsing table with 3 states and 3 terminals:
- //
- // a b c
- // +-------+----+-------+-+
- // |state0 | | s0,r0 | |
- // |state1 | acc| | |
- // |state2 | | r1 | |
- // +-------+----+-------+-+
- //
- // The final LRTable:
- // - TerminalOffset: [a] = 0, [b] = 1, [c] = 4, [d] = 4 (d is a sentinel)
- // - States: [ 1, 0, 0, 2]
- // Actions: [ acc, s0, r0, r1]
- // ~~~ corresponding range for terminal a
- // ~~~~~~~~~~ corresponding range for terminal b
- // First step, we sort all entries by (Symbol, State, Action).
- std::vector<Entry> Sorted(Entries.begin(), Entries.end());
- llvm::sort(Sorted, [](const Entry &L, const Entry &R) {
- return std::forward_as_tuple(L.Symbol, L.State, L.Act.opaque()) <
- std::forward_as_tuple(R.Symbol, R.State, R.Act.opaque());
- });
-
- LRTable Table;
- Table.Actions.reserve(Sorted.size());
- Table.States.reserve(Sorted.size());
- // We are good to finalize the States and Actions.
- for (const auto &E : Sorted) {
- Table.Actions.push_back(E.Act);
- Table.States.push_back(E.State);
- }
- // Initialize the terminal and nonterminal offset, all ranges are empty by
- // default.
- Table.TerminalOffset = std::vector<uint32_t>(GT.Terminals.size() + 1, 0);
- Table.NontermOffset = std::vector<uint32_t>(GT.Nonterminals.size() + 1, 0);
- size_t SortedIndex = 0;
- for (SymbolID NonterminalID = 0; NonterminalID < Table.NontermOffset.size();
- ++NonterminalID) {
- Table.NontermOffset[NonterminalID] = SortedIndex;
- while (SortedIndex < Sorted.size() &&
- Sorted[SortedIndex].Symbol == NonterminalID)
- ++SortedIndex;
- }
- for (size_t Terminal = 0; Terminal < Table.TerminalOffset.size();
- ++Terminal) {
- Table.TerminalOffset[Terminal] = SortedIndex;
- while (SortedIndex < Sorted.size() &&
- Sorted[SortedIndex].Symbol ==
- tokenSymbol(static_cast<tok::TokenKind>(Terminal)))
- ++SortedIndex;
- }
- return Table;
- }
-
-private:
- llvm::DenseSet<Entry> Entries;
-};
-
-LRTable LRTable::buildForTests(const GrammarTable &GT,
- llvm::ArrayRef<Entry> Entries) {
- Builder Build;
- for (const Entry &E : Entries)
- Build.insert(E);
- return std::move(Build).build(GT);
-}
-
-LRTable LRTable::buildSLR(const Grammar &G) {
- Builder Build;
- auto Graph = LRGraph::buildLR0(G);
- for (const auto &T : Graph.edges()) {
- Action Act = isToken(T.Label) ? Action::shift(T.Dst) : Action::goTo(T.Dst);
- Build.insert({T.Src, T.Label, Act});
- }
- assert(Graph.states().size() <= (1 << StateBits) &&
-         "Graph states exceed the maximum limit!");
- auto FollowSets = followSets(G);
- for (StateID SID = 0; SID < Graph.states().size(); ++SID) {
- for (const Item &I : Graph.states()[SID].Items) {
- // If we've just parsed the start symbol, we can accept the input.
- if (G.lookupRule(I.rule()).Target == G.startSymbol() && !I.hasNext()) {
- Build.insert({SID, tokenSymbol(tok::eof), Action::accept(I.rule())});
- continue;
- }
- if (!I.hasNext()) {
- // If we've reached the end of a rule A := ..., then we can reduce if
- // the next token is in the follow set of A.
- for (SymbolID Follow : FollowSets[G.lookupRule(I.rule()).Target]) {
- assert(isToken(Follow));
- Build.insert({SID, Follow, Action::reduce(I.rule())});
- }
- }
- }
- }
- return std::move(Build).build(G.table());
-}
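-
-// As a sketch, for the toy grammar { _ := E, E := n } (n a terminal), the LR0
-// graph has three states; buildSLR emits a shift on n and a goto on E from the
-// start state, an accept on eof in the state containing "_ := E .", and a
-// reduce by "E := n" on eof (the only member of FOLLOW(E)) in the state
-// containing "E := n .".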
-
-} // namespace pseudo
-} // namespace syntax
-} // namespace clang
+++ /dev/null
-//===--- Lex.cpp - extract token stream from source code ---------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang/Basic/SourceLocation.h"
-#include "clang/Basic/TokenKinds.h"
-#include "clang/Lex/Lexer.h"
-#include "clang/Lex/LiteralSupport.h"
-#include "clang/Tooling/Syntax/Pseudo/Token.h"
-
-namespace clang {
-namespace syntax {
-namespace pseudo {
-
-TokenStream lex(const std::string &Code, const clang::LangOptions &LangOpts) {
- clang::SourceLocation Start;
- // Tokenize using clang's lexer in raw mode.
- // std::string guarantees null-termination, which the lexer needs.
- clang::Lexer Lexer(Start, LangOpts, Code.data(), Code.data(),
- Code.data() + Code.size());
- Lexer.SetCommentRetentionState(true);
-
- TokenStream Result;
- clang::Token CT;
- unsigned LastOffset = 0;
- unsigned Line = 0;
- unsigned Indent = 0;
- for (Lexer.LexFromRawLexer(CT); CT.getKind() != clang::tok::eof;
- Lexer.LexFromRawLexer(CT)) {
- unsigned Offset =
- CT.getLocation().getRawEncoding() - Start.getRawEncoding();
-
- Token Tok;
- Tok.Data = &Code[Offset];
- Tok.Length = CT.getLength();
- Tok.Kind = CT.getKind();
-
- // Update current line number and indentation from raw source code.
- unsigned NewLineStart = 0;
- for (unsigned i = LastOffset; i < Offset; ++i) {
- if (Code[i] == '\n') {
- NewLineStart = i + 1;
- ++Line;
- }
- }
- if (NewLineStart || !LastOffset) {
- Indent = 0;
- for (char c : StringRef(Code).slice(NewLineStart, Offset)) {
- if (c == ' ')
- ++Indent;
- else if (c == '\t')
- Indent += 8;
- else
- break;
- }
- }
- Tok.Indent = Indent;
- Tok.Line = Line;
-
- if (CT.isAtStartOfLine())
- Tok.setFlag(LexFlags::StartsPPLine);
- if (CT.needsCleaning() || CT.hasUCN())
- Tok.setFlag(LexFlags::NeedsCleaning);
-
- Result.push(Tok);
- LastOffset = Offset;
- }
- Result.finalize();
- return Result;
-}
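-
-// As a sketch, lexing "int x; // note" yields raw_identifier tokens for "int"
-// and "x" (not yet classified as keyword/identifier; see cook() below), then
-// semi and a comment token, all with Line = 0 and Indent = 0.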
-
-TokenStream cook(const TokenStream &Code, const LangOptions &LangOpts) {
- auto CleanedStorage = std::make_shared<llvm::BumpPtrAllocator>();
- clang::IdentifierTable Identifiers(LangOpts);
- TokenStream Result(CleanedStorage);
-
- for (auto Tok : Code.tokens()) {
- if (Tok.flag(LexFlags::NeedsCleaning)) {
- // Remove escaped newlines and trigraphs.
- llvm::SmallString<64> CleanBuffer;
- const char *Pos = Tok.text().begin();
- while (Pos < Tok.text().end()) {
- unsigned CharSize = 0;
- CleanBuffer.push_back(
- clang::Lexer::getCharAndSizeNoWarn(Pos, CharSize, LangOpts));
- assert(CharSize != 0 && "no progress!");
- Pos += CharSize;
- }
- // Remove universal character names (UCN).
- llvm::SmallString<64> UCNBuffer;
- clang::expandUCNs(UCNBuffer, CleanBuffer);
-
- llvm::StringRef Text = llvm::StringRef(UCNBuffer).copy(*CleanedStorage);
- Tok.Data = Text.data();
- Tok.Length = Text.size();
- Tok.Flags &= ~static_cast<decltype(Tok.Flags)>(LexFlags::NeedsCleaning);
- }
- // Cook raw_identifiers into identifier, keyword, etc.
- if (Tok.Kind == tok::raw_identifier)
- Tok.Kind = Identifiers.get(Tok.text()).getTokenID();
- Result.push(std::move(Tok));
- }
-
- Result.finalize();
- return Result;
-}
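-
-// As a sketch, cooking the stream above turns the raw_identifier "int" into
-// kw_int and "x" into identifier; a token flagged NeedsCleaning (e.g. one
-// spelled with an escaped newline or a UCN) is first re-spelled into storage
-// owned by the returned stream.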
-
-} // namespace pseudo
-} // namespace syntax
-} // namespace clang
+++ /dev/null
-# clang pseudoparser
-
-This directory implements an approximate heuristic parser for C++, based on the
-clang lexer, the C++ grammar, and the GLR parsing algorithm.
-
-It parses a file in isolation, without reading its included headers.
-The result is a strict syntactic tree whose structure follows the C++ grammar.
-There is no semantic analysis, apart from guesses to disambiguate the parse.
-Disambiguation can optionally be guided by an AST or a symbol index.
-
-For now, the best reference on intended scope is the [design proposal],
-with further discussion on the [RFC].
-
-## Dependencies between pseudoparser and clang
-
-Dependencies are limited because they mostly aren't needed, but also to avoid
-placing a burden on clang maintainers.
-
-The pseudoparser reuses the clang lexer (clangLex and clangBasic libraries) but
-not the higher-level libraries (Parse, Sema, AST, Frontend...).
-
-When the pseudoparser should be used together with an AST (e.g. to guide
-disambiguation), this is a separate "bridge" library that depends on both.
-
-Clang does not depend on the pseudoparser at all. If this seems useful in future
-it should be discussed by RFC.
-
-## Parity between pseudoparser and clang
-
-The pseudoparser aims to understand real-world code, and particularly the
-languages and extensions supported by Clang.
-
-However we don't try to keep these in lockstep: there's no expectation that
-Clang parser changes are accompanied by pseudoparser changes or vice versa.
-
-[design proposal]: https://docs.google.com/document/d/1eGkTOsFja63wsv8v0vd5JdoTonj-NlN3ujGF0T7xDbM/edit
-[RFC]: https://discourse.llvm.org/t/rfc-a-c-pseudo-parser-for-tooling/59217/49
+++ /dev/null
-//===--- Token.cpp - Tokens and token streams in the pseudoparser ---------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang/Tooling/Syntax/Pseudo/Token.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/FormatVariadic.h"
-
-namespace clang {
-namespace syntax {
-namespace pseudo {
-
-llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token &T) {
- OS << llvm::formatv("{0} {1}:{2} ", clang::tok::getTokenName(T.Kind), T.Line,
- T.Indent);
- OS << '"';
- llvm::printEscapedString(T.text(), OS);
- OS << '"';
- if (T.Flags)
- OS << llvm::format(" flags=%x", T.Flags);
- return OS;
-}
-
-llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const TokenStream &TS) {
- OS << "Index Kind Line Text\n";
- for (const auto &T : TS.tokens()) {
- OS << llvm::format("%5d: %16s %4d:%-2d ", TS.index(T),
- clang::tok::getTokenName(T.Kind), T.Line, T.Indent);
- OS << '"';
- llvm::printEscapedString(T.text(), OS);
- OS << '"';
- if (T.Flags)
- OS << llvm::format(" flags=%x", T.Flags);
- OS << '\n';
- }
- return OS;
-}
-
-llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token::Range &R) {
- OS << llvm::formatv("[{0},{1})", R.Begin, R.End);
- return OS;
-}
-
-TokenStream::TokenStream(std::shared_ptr<void> Payload)
- : Payload(std::move(Payload)) {
- Storage.emplace_back();
- Storage.back().Kind = clang::tok::eof;
-}
-
-void TokenStream::finalize() {
- assert(!isFinalized());
- unsigned LastLine = Storage.back().Line;
- Storage.emplace_back();
- Storage.back().Kind = tok::eof;
- Storage.back().Line = LastLine + 1;
-
- Tokens = Storage;
- Tokens = Tokens.drop_front().drop_back();
-}
-
-bool TokenStream::isFinalized() const {
- assert(!Storage.empty() && Storage.front().Kind == tok::eof);
- if (Storage.size() == 1)
- return false;
- return Storage.back().Kind == tok::eof;
-}
-
-void TokenStream::print(llvm::raw_ostream &OS) const {
- bool FirstToken = true;
- unsigned LastLine = -1;
- StringRef LastText;
- for (const auto &T : tokens()) {
- StringRef Text = T.text();
- if (FirstToken) {
- FirstToken = false;
- } else if (T.Line == LastLine) {
- if (LastText.data() + LastText.size() != Text.data())
- OS << ' ';
- } else {
- OS << '\n';
- OS.indent(T.Indent);
- }
- OS << Text;
- LastLine = T.Line;
- LastText = Text;
- }
- if (!FirstToken)
- OS << '\n';
-}
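-
-// As a sketch: printing tokens lexed from "int   main  ()" emits
-// "int main ()" followed by a newline: a single space wherever the source
-// tokens were separated, and none between the adjacent "(" and ")".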
-
-TokenStream stripComments(const TokenStream &Input) {
- TokenStream Out;
- for (const Token &T : Input.tokens()) {
- if (T.Kind == tok::comment)
- continue;
- Out.push(T);
- }
- Out.finalize();
- return Out;
-}
-
-} // namespace pseudo
-} // namespace syntax
-} // namespace clang
+++ /dev/null
-# This is a C++ grammar from the C++ standard [1].
-#
-# The grammar is a superset of the true grammar, requiring semantic constraints
-# to resolve ambiguities. The grammar is context-free and ambiguous (beyond the
-# limit of LR(k)). We use a general parsing algorithm (e.g. GLR) to handle the
-# grammar and generate a transition table which is used to drive the parsing.
-#
-# It aims to align with the ISO C++ grammar as much as possible. We adjust it
-# to fit the need for the grammar-based parser:
-# - attributes are omitted, which will be handled as comments;
-# - we don't allow nullable non-terminal symbols. There are a few nullable
-#   non-terminals in the spec grammar; they are adjusted to be non-nullable;
-# - the file merely describes the core C++ grammar. Preprocessor directives and
-# lexical conversions are omitted as we reuse clang's lexer and run a fake
-# preprocessor;
-#
-# Guidelines:
-# - non-terminals are lower_case; terminals (aka tokens) correspond to
-# clang::TokenKind, written as "IDENTIFIER", "USING", "::" etc;
-# - optional symbols are supported, with a _opt suffix;
-#
-# [1] https://isocpp.org/files/papers/N4860.pdf
-#
-#
-# _ serves as a "fake" start symbol, coming with real grammar symbols.
-_ := translation-unit
-
-# gram.key
-typedef-name := IDENTIFIER
-typedef-name := simple-template-id
-namespace-name := IDENTIFIER
-namespace-name := namespace-alias
-namespace-alias := IDENTIFIER
-class-name := IDENTIFIER
-class-name := simple-template-id
-enum-name := IDENTIFIER
-template-name := IDENTIFIER
-
-# gram.basic
-#! Custom modifications to eliminate optional declaration-seq
-translation-unit := declaration-seq
-translation-unit := global-module-fragment_opt module-declaration declaration-seq_opt private-module-fragment_opt
-
-# gram.expr
-# expr.prim
-primary-expression := literal
-primary-expression := THIS
-primary-expression := ( expression )
-primary-expression := id-expression
-primary-expression := lambda-expression
-primary-expression := fold-expression
-primary-expression := requires-expression
-id-expression := unqualified-id
-id-expression := qualified-id
-unqualified-id := IDENTIFIER
-unqualified-id := operator-function-id
-unqualified-id := conversion-function-id
-unqualified-id := literal-operator-id
-unqualified-id := ~ type-name
-unqualified-id := ~ decltype-specifier
-unqualified-id := template-id
-qualified-id := nested-name-specifier TEMPLATE_opt unqualified-id
-nested-name-specifier := ::
-nested-name-specifier := type-name ::
-nested-name-specifier := namespace-name ::
-nested-name-specifier := decltype-specifier ::
-nested-name-specifier := nested-name-specifier IDENTIFIER ::
-nested-name-specifier := nested-name-specifier TEMPLATE_opt simple-template-id ::
-lambda-expression := lambda-introducer lambda-declarator_opt compound-statement
-lambda-expression := lambda-introducer < template-parameter-list > requires-clause_opt lambda-declarator_opt compound-statement
-lambda-introducer := [ lambda-capture_opt ]
-lambda-declarator := ( parameter-declaration-clause_opt ) decl-specifier-seq_opt noexcept-specifier_opt trailing-return-type_opt requires-clause_opt
-lambda-capture := capture-default
-lambda-capture := capture-list
-lambda-capture := capture-default , capture-list
-capture-default := &
-capture-default := =
-capture-list := capture
-capture-list := capture-list , capture
-capture := simple-capture
-capture := init-capture
-simple-capture := IDENTIFIER ..._opt
-simple-capture := & IDENTIFIER ..._opt
-simple-capture := THIS
-simple-capture := * THIS
-init-capture := ..._opt IDENTIFIER initializer
-init-capture := & ..._opt IDENTIFIER initializer
-fold-expression := ( cast-expression fold-operator ... )
-fold-expression := ( ... fold-operator cast-expression )
-fold-expression := ( cast-expression fold-operator ... fold-operator cast-expression )
-fold-operator := +
-fold-operator := -
-fold-operator := *
-fold-operator := /
-fold-operator := %
-fold-operator := ^
-fold-operator := |
-fold-operator := <<
-fold-operator := >>
-fold-operator := +=
-fold-operator := -=
-fold-operator := *=
-fold-operator := /=
-fold-operator := %=
-fold-operator := ^=
-fold-operator := &=
-fold-operator := |=
-fold-operator := <<=
-fold-operator := >>=
-fold-operator := =
-fold-operator := ==
-fold-operator := !=
-fold-operator := <
-fold-operator := >
-fold-operator := <=
-fold-operator := >=
-fold-operator := &&
-fold-operator := ||
-fold-operator := ,
-fold-operator := .*
-fold-operator := ->*
-requires-expression := REQUIRES requirement-parameter-list_opt requirement-body
-requirement-parameter-list := ( parameter-declaration-clause_opt )
-requirement-body := { requirement-seq }
-requirement-seq := requirement
-requirement-seq := requirement-seq requirement
-requirement := simple-requirement
-requirement := type-requirement
-requirement := compound-requirement
-requirement := nested-requirement
-simple-requirement := expression ;
-type-requirement := TYPENAME nested-name-specifier_opt type-name ;
-compound-requirement := { expression } NOEXCEPT_opt return-type-requirement_opt ;
-return-type-requirement := -> type-constraint
-nested-requirement := REQUIRES constraint-expression ;
-# expr.post
-postfix-expression := primary-expression
-postfix-expression := postfix-expression [ expr-or-braced-init-list ]
-postfix-expression := postfix-expression ( expression-list_opt )
-postfix-expression := simple-type-specifier ( expression-list_opt )
-postfix-expression := typename-specifier ( expression-list_opt )
-postfix-expression := simple-type-specifier braced-init-list
-postfix-expression := postfix-expression . TEMPLATE_opt id-expression
-postfix-expression := postfix-expression -> TEMPLATE_opt id-expression
-postfix-expression := postfix-expression ++
-postfix-expression := postfix-expression --
-postfix-expression := DYNAMIC_CAST < type-id > ( expression )
-postfix-expression := STATIC_CAST < type-id > ( expression )
-postfix-expression := REINTERPRET_CAST < type-id > ( expression )
-postfix-expression := CONST_CAST < type-id > ( expression )
-postfix-expression := TYPEID ( expression )
-postfix-expression := TYPEID ( type-id )
-expression-list := initializer-list
-# expr.unary
-unary-expression := postfix-expression
-unary-expression := unary-operator cast-expression
-unary-expression := ++ cast-expression
-unary-expression := -- cast-expression
-unary-expression := await-expression
-unary-expression := SIZEOF unary-expression
-unary-expression := SIZEOF ( type-id )
-unary-expression := SIZEOF ... ( IDENTIFIER )
-unary-expression := ALIGNOF ( type-id )
-unary-expression := noexcept-expression
-unary-expression := new-expression
-unary-expression := delete-expression
-unary-operator := *
-unary-operator := &
-unary-operator := +
-unary-operator := -
-unary-operator := !
-unary-operator := ~
-await-expression := CO_AWAIT cast-expression
-noexcept-expression := NOEXCEPT ( expression )
-new-expression := ::_opt NEW new-placement_opt new-type-id new-initializer_opt
-new-expression := ::_opt NEW new-placement_opt ( type-id ) new-initializer_opt
-new-placement := ( expression-list )
-new-type-id := type-specifier-seq new-declarator_opt
-new-declarator := ptr-operator new-declarator_opt
-new-declarator := noptr-new-declarator
-noptr-new-declarator := [ expression_opt ]
-noptr-new-declarator := noptr-new-declarator [ constant-expression ]
-new-initializer := ( expression-list_opt )
-new-initializer := braced-init-list
-delete-expression := ::_opt DELETE cast-expression
-delete-expression := ::_opt DELETE [ ] cast-expression
-cast-expression := unary-expression
-cast-expression := ( type-id ) cast-expression
-# expr.mptr.oper
-pm-expression := cast-expression
-pm-expression := pm-expression .* cast-expression
-pm-expression := pm-expression ->* cast-expression
-# expr.mul
-multiplicative-expression := pm-expression
-multiplicative-expression := multiplicative-expression * pm-expression
-multiplicative-expression := multiplicative-expression / pm-expression
-multiplicative-expression := multiplicative-expression % pm-expression
-# expr.add
-additive-expression := multiplicative-expression
-additive-expression := additive-expression + multiplicative-expression
-additive-expression := additive-expression - multiplicative-expression
-# expr.shift
-shift-expression := additive-expression
-shift-expression := shift-expression << additive-expression
-shift-expression := shift-expression >> additive-expression
-# expr.spaceship
-compare-expression := shift-expression
-compare-expression := compare-expression <=> shift-expression
-# expr.rel
-relational-expression := compare-expression
-relational-expression := relational-expression < compare-expression
-relational-expression := relational-expression > compare-expression
-relational-expression := relational-expression <= compare-expression
-relational-expression := relational-expression >= compare-expression
-# expr.eq
-equality-expression := relational-expression
-equality-expression := equality-expression == relational-expression
-equality-expression := equality-expression != relational-expression
-# expr.bit.and
-and-expression := equality-expression
-and-expression := and-expression & equality-expression
-# expr.xor
-exclusive-or-expression := and-expression
-exclusive-or-expression := exclusive-or-expression ^ and-expression
-# expr.or
-inclusive-or-expression := exclusive-or-expression
-inclusive-or-expression := inclusive-or-expression | exclusive-or-expression
-# expr.log.and
-logical-and-expression := inclusive-or-expression
-logical-and-expression := logical-and-expression && inclusive-or-expression
-# expr.log.or
-logical-or-expression := logical-and-expression
-logical-or-expression := logical-or-expression || logical-and-expression
-# expr.cond
-conditional-expression := logical-or-expression
-conditional-expression := logical-or-expression ? expression : assignment-expression
-# expr.ass
-yield-expression := CO_YIELD assignment-expression
-yield-expression := CO_YIELD braced-init-list
-throw-expression := THROW assignment-expression_opt
-assignment-expression := conditional-expression
-assignment-expression := yield-expression
-assignment-expression := throw-expression
-assignment-expression := logical-or-expression assignment-operator initializer-clause
-assignment-operator := =
-assignment-operator := *=
-assignment-operator := /=
-assignment-operator := %=
-assignment-operator := +=
-assignment-operator := -=
-assignment-operator := >>=
-assignment-operator := <<=
-assignment-operator := &=
-assignment-operator := ^=
-assignment-operator := |=
-# expr.comma
-expression := assignment-expression
-expression := expression , assignment-expression
-# expr.const
-constant-expression := conditional-expression
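-#! The chain from pm-expression up to assignment-expression encodes operator
-#! precedence: e.g. 'a + b * c' parses as 'a + (b * c)' because
-#! multiplicative-expression binds tighter than additive-expression.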
-
-# gram.stmt
-statement := labeled-statement
-statement := expression-statement
-statement := compound-statement
-statement := selection-statement
-statement := iteration-statement
-statement := jump-statement
-statement := declaration-statement
-statement := try-block
-init-statement := expression-statement
-init-statement := simple-declaration
-condition := expression
-condition := decl-specifier-seq declarator brace-or-equal-initializer
-labeled-statement := IDENTIFIER : statement
-labeled-statement := CASE constant-expression : statement
-labeled-statement := DEFAULT : statement
-expression-statement := expression_opt ;
-compound-statement := { statement-seq_opt }
-statement-seq := statement
-statement-seq := statement-seq statement
-selection-statement := IF CONSTEXPR_opt ( init-statement_opt condition ) statement
-selection-statement := IF CONSTEXPR_opt ( init-statement_opt condition ) statement ELSE statement
-selection-statement := SWITCH ( init-statement_opt condition ) statement
-iteration-statement := WHILE ( condition ) statement
-iteration-statement := DO statement WHILE ( expression ) ;
-iteration-statement := FOR ( init-statement condition_opt ; expression_opt ) statement
-iteration-statement := FOR ( init-statement_opt for-range-declaration : for-range-initializer ) statement
-for-range-declaration := decl-specifier-seq declarator
-for-range-declaration := decl-specifier-seq ref-qualifier_opt [ identifier-list ]
-for-range-initializer := expr-or-braced-init-list
-jump-statement := BREAK ;
-jump-statement := CONTINUE ;
-jump-statement := RETURN expr-or-braced-init-list_opt ;
-jump-statement := coroutine-return-statement
-jump-statement := GOTO IDENTIFIER ;
-coroutine-return-statement := CO_RETURN expr-or-braced-init-list_opt ;
-declaration-statement := block-declaration
-
-# gram.dcl
-declaration-seq := declaration
-declaration-seq := declaration-seq declaration
-declaration := block-declaration
-declaration := nodeclspec-function-declaration
-declaration := function-definition
-declaration := template-declaration
-declaration := deduction-guide
-declaration := explicit-instantiation
-declaration := explicit-specialization
-declaration := export-declaration
-declaration := linkage-specification
-declaration := namespace-definition
-declaration := empty-declaration
-declaration := module-import-declaration
-block-declaration := simple-declaration
-block-declaration := asm-declaration
-block-declaration := namespace-alias-definition
-block-declaration := using-declaration
-block-declaration := using-enum-declaration
-block-declaration := using-directive
-block-declaration := static_assert-declaration
-block-declaration := alias-declaration
-block-declaration := opaque-enum-declaration
-nodeclspec-function-declaration := declarator ;
-alias-declaration := USING IDENTIFIER = defining-type-id ;
-simple-declaration := decl-specifier-seq init-declarator-list_opt ;
-simple-declaration := decl-specifier-seq ref-qualifier_opt [ identifier-list ] initializer ;
-static_assert-declaration := STATIC_ASSERT ( constant-expression ) ;
-static_assert-declaration := STATIC_ASSERT ( constant-expression , string-literal ) ;
-empty-declaration := ;
-# dcl.spec
-decl-specifier := storage-class-specifier
-decl-specifier := defining-type-specifier
-decl-specifier := function-specifier
-decl-specifier := FRIEND
-decl-specifier := TYPEDEF
-decl-specifier := CONSTEXPR
-decl-specifier := CONSTEVAL
-decl-specifier := CONSTINIT
-decl-specifier := INLINE
-decl-specifier-seq := decl-specifier
-decl-specifier-seq := decl-specifier decl-specifier-seq
-storage-class-specifier := STATIC
-storage-class-specifier := THREAD_LOCAL
-storage-class-specifier := EXTERN
-storage-class-specifier := MUTABLE
-function-specifier := VIRTUAL
-function-specifier := explicit-specifier
-explicit-specifier := EXPLICIT ( constant-expression )
-explicit-specifier := EXPLICIT
-type-specifier := simple-type-specifier
-type-specifier := elaborated-type-specifier
-type-specifier := typename-specifier
-type-specifier := cv-qualifier
-type-specifier-seq := type-specifier
-type-specifier-seq := type-specifier type-specifier-seq
-defining-type-specifier := type-specifier
-defining-type-specifier := class-specifier
-defining-type-specifier := enum-specifier
-defining-type-specifier-seq := defining-type-specifier
-defining-type-specifier-seq := defining-type-specifier defining-type-specifier-seq
-simple-type-specifier := nested-name-specifier_opt type-name
-simple-type-specifier := nested-name-specifier TEMPLATE simple-template-id
-simple-type-specifier := decltype-specifier
-simple-type-specifier := placeholder-type-specifier
-simple-type-specifier := nested-name-specifier_opt template-name
-simple-type-specifier := CHAR
-simple-type-specifier := CHAR8_T
-simple-type-specifier := CHAR16_T
-simple-type-specifier := CHAR32_T
-simple-type-specifier := WCHAR_T
-simple-type-specifier := BOOL
-simple-type-specifier := SHORT
-simple-type-specifier := INT
-simple-type-specifier := LONG
-simple-type-specifier := SIGNED
-simple-type-specifier := UNSIGNED
-simple-type-specifier := FLOAT
-simple-type-specifier := DOUBLE
-simple-type-specifier := VOID
-type-name := class-name
-type-name := enum-name
-type-name := typedef-name
-elaborated-type-specifier := class-key nested-name-specifier_opt IDENTIFIER
-elaborated-type-specifier := class-key simple-template-id
-elaborated-type-specifier := class-key nested-name-specifier TEMPLATE_opt simple-template-id
-elaborated-type-specifier := elaborated-enum-specifier
-elaborated-enum-specifier := ENUM nested-name-specifier_opt IDENTIFIER
-decltype-specifier := DECLTYPE ( expression )
-placeholder-type-specifier := type-constraint_opt AUTO
-placeholder-type-specifier := type-constraint_opt DECLTYPE ( AUTO )
-init-declarator-list := init-declarator
-init-declarator-list := init-declarator-list , init-declarator
-init-declarator := declarator initializer_opt
-init-declarator := declarator requires-clause
-declarator := ptr-declarator
-declarator := noptr-declarator parameters-and-qualifiers trailing-return-type
-ptr-declarator := noptr-declarator
-ptr-declarator := ptr-operator ptr-declarator
-noptr-declarator := declarator-id
-noptr-declarator := noptr-declarator parameters-and-qualifiers
-noptr-declarator := noptr-declarator [ constant-expression_opt ]
-noptr-declarator := ( ptr-declarator )
-parameters-and-qualifiers := ( parameter-declaration-list_opt ) cv-qualifier-seq_opt ref-qualifier_opt noexcept-specifier_opt
-trailing-return-type := -> type-id
-ptr-operator := * cv-qualifier-seq_opt
-ptr-operator := &
-ptr-operator := &&
-ptr-operator := nested-name-specifier * cv-qualifier-seq_opt
-cv-qualifier-seq := cv-qualifier cv-qualifier-seq_opt
-cv-qualifier := CONST
-cv-qualifier := VOLATILE
-ref-qualifier := &
-ref-qualifier := &&
-declarator-id := ..._opt id-expression
-type-id := type-specifier-seq abstract-declarator_opt
-defining-type-id := defining-type-specifier-seq abstract-declarator_opt
-abstract-declarator := ptr-abstract-declarator
-abstract-declarator := noptr-abstract-declarator_opt parameters-and-qualifiers trailing-return-type
-abstract-declarator := abstract-pack-declarator
-ptr-abstract-declarator := noptr-abstract-declarator
-ptr-abstract-declarator := ptr-operator ptr-abstract-declarator_opt
-noptr-abstract-declarator := noptr-abstract-declarator_opt parameters-and-qualifiers
-noptr-abstract-declarator := noptr-abstract-declarator_opt [ constant-expression ]
-noptr-abstract-declarator := ( ptr-abstract-declarator )
-abstract-pack-declarator := noptr-abstract-pack-declarator
-abstract-pack-declarator := ptr-operator abstract-pack-declarator
-noptr-abstract-pack-declarator := noptr-abstract-pack-declarator parameters-and-qualifiers
-noptr-abstract-pack-declarator := noptr-abstract-pack-declarator [ constant-expression_opt ]
-noptr-abstract-pack-declarator := ...
-#! Custom modifications to avoid nullable clause.
-parameter-declaration-clause := parameter-declaration-list
-parameter-declaration-clause := parameter-declaration-list_opt ...
-parameter-declaration-clause := parameter-declaration-list , ...
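-#! (The standard clause can derive the empty sequence; here the empty case is
-#! expressed by marking the clause _opt at use sites, e.g.
-#! 'requirement-parameter-list := ( parameter-declaration-clause_opt )'.)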
-parameter-declaration-list := parameter-declaration
-parameter-declaration-list := parameter-declaration-list , parameter-declaration
-parameter-declaration := decl-specifier-seq declarator
-parameter-declaration := decl-specifier-seq declarator = initializer-clause
-parameter-declaration := decl-specifier-seq abstract-declarator_opt
-parameter-declaration := decl-specifier-seq abstract-declarator_opt = initializer-clause
-# dcl.init
-initializer := brace-or-equal-initializer
-initializer := ( expression-list )
-brace-or-equal-initializer := = initializer-clause
-brace-or-equal-initializer := braced-init-list
-initializer-clause := assignment-expression
-initializer-clause := braced-init-list
-braced-init-list := { initializer-list ,_opt }
-braced-init-list := { designated-initializer-list ,_opt }
-braced-init-list := { }
-initializer-list := initializer-clause ..._opt
-initializer-list := initializer-list , initializer-clause ..._opt
-designated-initializer-list := designated-initializer-clause
-designated-initializer-list := designated-initializer-list , designated-initializer-clause
-designated-initializer-clause := designator brace-or-equal-initializer
-designator := . IDENTIFIER
-expr-or-braced-init-list := expression
-expr-or-braced-init-list := braced-init-list
-# dcl.fct
-function-definition := decl-specifier-seq_opt declarator virt-specifier-seq_opt function-body
-function-definition := decl-specifier-seq_opt declarator requires-clause function-body
-function-body := ctor-initializer_opt compound-statement
-function-body := function-try-block
-function-body := = DEFAULT ;
-function-body := = DELETE ;
-# dcl.enum
-enum-specifier := enum-head { enumerator-list_opt }
-enum-specifier := enum-head { enumerator-list , }
-enum-head := enum-key enum-head-name_opt enum-base_opt
-enum-head-name := nested-name-specifier_opt IDENTIFIER
-opaque-enum-declaration := enum-key enum-head-name enum-base_opt ;
-enum-key := ENUM
-enum-key := ENUM CLASS
-enum-key := ENUM STRUCT
-enum-base := : type-specifier-seq
-enumerator-list := enumerator-definition
-enumerator-list := enumerator-list , enumerator-definition
-enumerator-definition := enumerator
-enumerator-definition := enumerator = constant-expression
-enumerator := IDENTIFIER
-using-enum-declaration := USING elaborated-enum-specifier ;
-# basic.namespace
-namespace-definition := named-namespace-definition
-namespace-definition := unnamed-namespace-definition
-namespace-definition := nested-namespace-definition
-named-namespace-definition := INLINE_opt NAMESPACE IDENTIFIER { namespace-body_opt }
-unnamed-namespace-definition := INLINE_opt NAMESPACE { namespace-body_opt }
-nested-namespace-definition := NAMESPACE enclosing-namespace-specifier :: INLINE_opt IDENTIFIER { namespace-body }
-enclosing-namespace-specifier := IDENTIFIER
-enclosing-namespace-specifier := enclosing-namespace-specifier :: INLINE_opt IDENTIFIER
-#! Custom modification to avoid nullable namespace-body.
-namespace-body := declaration-seq
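-#! (An empty body is instead expressed via namespace-body_opt, as in the
-#! named/unnamed namespace definitions above.)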
-namespace-alias-definition := NAMESPACE IDENTIFIER = qualified-namespace-specifier ;
-qualified-namespace-specifier := nested-name-specifier_opt namespace-name
-using-directive := USING NAMESPACE nested-name-specifier_opt namespace-name ;
-using-declaration := USING using-declarator-list ;
-using-declarator-list := using-declarator ..._opt
-using-declarator-list := using-declarator-list , using-declarator ..._opt
-using-declarator := TYPENAME_opt nested-name-specifier unqualified-id
-# dcl.asm
-asm-declaration := ASM ( string-literal ) ;
-# dcl.link
-linkage-specification := EXTERN string-literal { declaration-seq_opt }
-linkage-specification := EXTERN string-literal declaration
-
-# gram.module
-module-declaration := export-keyword_opt module-keyword module-name module-partition_opt
-module-name := module-name-qualifier_opt IDENTIFIER
-module-partition := : module-name-qualifier_opt IDENTIFIER
-module-name-qualifier := IDENTIFIER .
-module-name-qualifier := module-name-qualifier IDENTIFIER .
-export-declaration := EXPORT declaration
-export-declaration := EXPORT ( declaration-seq_opt )
-export-declaration := export-keyword module-import-declaration
-module-import-declaration := import-keyword module-name
-module-import-declaration := import-keyword module-partition
-# FIXME: we don't have header-name in the grammar. Handle these in PP?
-# module-import-declaration := import-keyword header-name
-global-module-fragment := module-keyword ; declaration-seq_opt
-private-module-fragment := module-keyword : PRIVATE ; declaration-seq_opt
-
-# gram.class
-class-specifier := class-head { member-specification_opt }
-class-head := class-key class-head-name class-virt-specifier_opt base-clause_opt
-class-head := class-key base-clause_opt
-class-head-name := nested-name-specifier_opt class-name
-class-virt-specifier := contextual-final
-class-key := CLASS
-class-key := STRUCT
-class-key := UNION
-member-specification := member-declaration member-specification_opt
-member-specification := access-specifier : member-declaration member-specification_opt
-member-declaration := decl-specifier-seq_opt member-declarator-list_opt ;
-member-declaration := function-definition
-member-declaration := using-declaration
-member-declaration := using-enum-declaration
-member-declaration := static_assert-declaration
-member-declaration := template-declaration
-member-declaration := explicit-specialization
-member-declaration := deduction-guide
-member-declaration := alias-declaration
-member-declaration := opaque-enum-declaration
-member-declaration := empty-declaration
-member-declarator-list := member-declarator
-member-declarator-list := member-declarator-list , member-declarator
-member-declarator := declarator virt-specifier-seq_opt pure-specifier_opt
-member-declarator := declarator requires-clause
-member-declarator := declarator brace-or-equal-initializer
-member-declarator := IDENTIFIER_opt : constant-expression brace-or-equal-initializer_opt
-virt-specifier-seq := virt-specifier
-virt-specifier-seq := virt-specifier-seq virt-specifier
-virt-specifier := contextual-override
-virt-specifier := contextual-final
-pure-specifier := = contextual-zero
-conversion-function-id := OPERATOR conversion-type-id
-conversion-type-id := type-specifier-seq conversion-declarator_opt
-conversion-declarator := ptr-operator conversion-declarator_opt
-base-clause := : base-specifier-list
-base-specifier-list := base-specifier ..._opt
-base-specifier-list := base-specifier-list , base-specifier ..._opt
-base-specifier := class-or-decltype
-base-specifier := VIRTUAL access-specifier_opt class-or-decltype
-base-specifier := access-specifier VIRTUAL_opt class-or-decltype
-class-or-decltype := nested-name-specifier_opt type-name
-class-or-decltype := nested-name-specifier TEMPLATE simple-template-id
-class-or-decltype := decltype-specifier
-access-specifier := PRIVATE
-access-specifier := PROTECTED
-access-specifier := PUBLIC
-ctor-initializer := : mem-initializer-list
-mem-initializer-list := mem-initializer ..._opt
-mem-initializer-list := mem-initializer-list , mem-initializer ..._opt
-mem-initializer := mem-initializer-id ( expression-list_opt )
-mem-initializer := mem-initializer-id braced-init-list
-mem-initializer-id := class-or-decltype
-mem-initializer-id := IDENTIFIER
-
-# gram.over
-operator-function-id := OPERATOR operator-name
-operator-name := NEW
-operator-name := DELETE
-operator-name := NEW [ ]
-operator-name := DELETE [ ]
-operator-name := CO_AWAIT
-operator-name := ( )
-operator-name := [ ]
-operator-name := ->
-operator-name := ->*
-operator-name := ~
-operator-name := !
-operator-name := +
-operator-name := -
-operator-name := *
-operator-name := /
-operator-name := %
-operator-name := ^
-operator-name := &
-operator-name := |
-operator-name := =
-operator-name := +=
-operator-name := -=
-operator-name := *=
-operator-name := /=
-operator-name := %=
-operator-name := ^=
-operator-name := &=
-operator-name := |=
-operator-name := ==
-operator-name := !=
-operator-name := <
-operator-name := >
-operator-name := <=
-operator-name := >=
-operator-name := <=>
-operator-name := &&
-operator-name := ||
-operator-name := <<
-operator-name := >>
-operator-name := <<=
-operator-name := >>=
-operator-name := ++
-operator-name := --
-operator-name := ,
-literal-operator-id := OPERATOR string-literal IDENTIFIER
-literal-operator-id := OPERATOR user-defined-string-literal
-
-# gram.temp
-template-declaration := template-head declaration
-template-declaration := template-head concept-definition
-template-head := TEMPLATE < template-parameter-list > requires-clause_opt
-template-parameter-list := template-parameter
-template-parameter-list := template-parameter-list , template-parameter
-requires-clause := REQUIRES constraint-logical-or-expression
-constraint-logical-or-expression := constraint-logical-and-expression
-constraint-logical-or-expression := constraint-logical-or-expression || constraint-logical-and-expression
-constraint-logical-and-expression := primary-expression
-constraint-logical-and-expression := constraint-logical-and-expression && primary-expression
-template-parameter := type-parameter
-template-parameter := parameter-declaration
-type-parameter := type-parameter-key ..._opt IDENTIFIER
-type-parameter := type-parameter-key IDENTIFIER_opt = type-id
-type-parameter := type-constraint ..._opt IDENTIFIER_opt
-type-parameter := type-constraint IDENTIFIER_opt = type-id
-type-parameter := template-head type-parameter-key ..._opt IDENTIFIER_opt
-type-parameter := template-head type-parameter-key IDENTIFIER_opt = id-expression
-type-parameter-key := CLASS
-type-parameter-key := TYPENAME
-type-constraint := nested-name-specifier_opt concept-name
-type-constraint := nested-name-specifier_opt concept-name < template-argument-list_opt >
-simple-template-id := template-name < template-argument-list_opt >
-template-id := simple-template-id
-template-id := operator-function-id < template-argument-list_opt >
-template-id := literal-operator-id < template-argument-list_opt >
-template-argument-list := template-argument ..._opt
-template-argument-list := template-argument-list , template-argument ..._opt
-template-argument := constant-expression
-template-argument := type-id
-template-argument := id-expression
-constraint-expression := logical-or-expression
-deduction-guide := explicit-specifier_opt template-name ( parameter-declaration-list_opt ) -> simple-template-id ;
-concept-definition := CONCEPT concept-name = constraint-expression ;
-concept-name := IDENTIFIER
-typename-specifier := TYPENAME nested-name-specifier IDENTIFIER
-typename-specifier := TYPENAME nested-name-specifier TEMPLATE_opt simple-template-id
-explicit-instantiation := EXTERN_opt TEMPLATE declaration
-explicit-specialization := TEMPLATE < > declaration
-
-# gram.except
-try-block := TRY compound-statement handler-seq
-function-try-block := TRY ctor-initializer_opt compound-statement handler-seq
-handler-seq := handler handler-seq_opt
-handler := CATCH ( exception-declaration ) compound-statement
-exception-declaration := type-specifier-seq declarator
-exception-declaration := type-specifier-seq abstract-declarator_opt
-noexcept-specifier := NOEXCEPT ( constant-expression )
-noexcept-specifier := NOEXCEPT
-
-# gram.cpp
-identifier-list := IDENTIFIER
-identifier-list := identifier-list , IDENTIFIER
-
-# gram.lex
-#! Since we use the clang lexer, most lexical symbols are not needed; we only
-#! define the literal rules below.
-literal := integer-literal
-literal := character-literal
-literal := floating-point-literal
-literal := string-literal
-literal := boolean-literal
-literal := pointer-literal
-literal := user-defined-literal
-integer-literal := NUMERIC_CONSTANT
-character-literal := CHAR_CONSTANT
-character-literal := WIDE_CHAR_CONSTANT
-character-literal := UTF8_CHAR_CONSTANT
-character-literal := UTF16_CHAR_CONSTANT
-character-literal := UTF32_CHAR_CONSTANT
-floating-point-literal := NUMERIC_CONSTANT
-string-literal-chunk := STRING_LITERAL
-string-literal-chunk := WIDE_STRING_LITERAL
-string-literal-chunk := UTF8_STRING_LITERAL
-string-literal-chunk := UTF16_STRING_LITERAL
-string-literal-chunk := UTF32_STRING_LITERAL
-#! Technically, string concatenation happens in translation phase 6, before
-#! parsing, so it doesn't belong in the grammar. However, we extend the
-#! grammar to support it, keeping the pseudoparser usable on practical code.
-string-literal := string-literal-chunk
-string-literal := string-literal string-literal-chunk
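-#! For example, the two adjacent tokens in 'printf("Hello " "world")' reduce
-#! to a single string-literal via the chunk rules above.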
-user-defined-literal := user-defined-integer-literal
-user-defined-literal := user-defined-floating-point-literal
-user-defined-literal := user-defined-string-literal
-user-defined-literal := user-defined-character-literal
-user-defined-integer-literal := NUMERIC_CONSTANT
-user-defined-string-literal-chunk := STRING_LITERAL
-user-defined-string-literal-chunk := WIDE_STRING_LITERAL
-user-defined-string-literal-chunk := UTF8_STRING_LITERAL
-user-defined-string-literal-chunk := UTF16_STRING_LITERAL
-user-defined-string-literal-chunk := UTF32_STRING_LITERAL
-user-defined-string-literal := user-defined-string-literal-chunk
-user-defined-string-literal := string-literal-chunk user-defined-string-literal
-user-defined-string-literal := user-defined-string-literal string-literal-chunk
-user-defined-floating-point-literal := NUMERIC_CONSTANT
-user-defined-character-literal := CHAR_CONSTANT
-user-defined-character-literal := WIDE_CHAR_CONSTANT
-user-defined-character-literal := UTF8_CHAR_CONSTANT
-user-defined-character-literal := UTF16_CHAR_CONSTANT
-user-defined-character-literal := UTF32_CHAR_CONSTANT
-boolean-literal := FALSE
-boolean-literal := TRUE
-pointer-literal := NULLPTR
-
-#! Contextual keywords -- the clang lexer always lexes them as identifier
-#! tokens. These rules are placeholders for literal text in the grammar that
-#! lexes as another token kind.
-contextual-override := IDENTIFIER
-contextual-final := IDENTIFIER
-contextual-zero := NUMERIC_CONSTANT
-module-keyword := IDENTIFIER
-import-keyword := IDENTIFIER
-export-keyword := IDENTIFIER
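-#! For example, in 'struct S final {};' the token 'final' is lexed as an
-#! IDENTIFIER; the contextual-final placeholder lets the grammar use it as a
-#! class-virt-specifier without special lexer support.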
+++ /dev/null
-// verify clang/lib/Tooling/Syntax/Pseudo/cxx.bnf
-// RUN: clang-pseudo -grammar=%cxx-bnf-file
+++ /dev/null
-int is_debug() {
-#ifndef NDEBUG
- return 1; // in debug mode
-#else
- return 0;
-#endif
-}
-
-/* This comment gets lexed along with the input above! We just don't CHECK it.
-
-RUN: clang-pseudo -source %s -print-source | FileCheck %s -check-prefix=SOURCE --strict-whitespace
- SOURCE: int is_debug() {
-SOURCE-NEXT: #ifndef NDEBUG
-SOURCE-NEXT: return 1; // in debug mode
-SOURCE-NEXT: #else
-SOURCE-NEXT: return 0;
-SOURCE-NEXT: #endif
-SOURCE-NEXT: }
-
-RUN: clang-pseudo -source %s -print-tokens | FileCheck %s -check-prefix=TOKEN
- TOKEN: 0: raw_identifier 0:0 "int" flags=1
-TOKEN-NEXT: raw_identifier 0:0 "is_debug"
-TOKEN-NEXT: l_paren 0:0 "("
-TOKEN-NEXT: r_paren 0:0 ")"
-TOKEN-NEXT: l_brace 0:0 "{"
-TOKEN-NEXT: hash 1:0 "#" flags=1
-TOKEN-NEXT: raw_identifier 1:0 "ifndef"
-TOKEN-NEXT: raw_identifier 1:0 "NDEBUG"
-TOKEN-NEXT: raw_identifier 2:2 "return" flags=1
-TOKEN-NEXT: numeric_constant 2:2 "1"
-TOKEN-NEXT: semi 2:2 ";"
-TOKEN-NEXT: comment 2:2 "// in debug mode"
-TOKEN-NEXT: hash 3:0 "#" flags=1
-TOKEN-NEXT: raw_identifier 3:0 "else"
-TOKEN-NEXT: raw_identifier 4:2 "return" flags=1
-TOKEN-NEXT: numeric_constant 4:2 "0"
-TOKEN-NEXT: semi 4:2 ";"
-TOKEN-NEXT: hash 5:0 "#" flags=1
-TOKEN-NEXT: raw_identifier 5:0 "endif"
-TOKEN-NEXT: r_brace 6:0 "}" flags=1
-
-RUN: clang-pseudo -source %s -print-directive-map | FileCheck %s -check-prefix=PPS --strict-whitespace
- PPS: code (5 tokens)
-PPS-NEXT: #ifndef (3 tokens)
-PPS-NEXT: code (4 tokens)
-PPS-NEXT: #else (2 tokens)
-PPS-NEXT: code (3 tokens)
-PPS-NEXT: #endif (2 tokens)
-PPS-NEXT: code (2 tokens)
- ^ including this block comment
-
-*******************************************************************************/
+++ /dev/null
-cxx_bnf_file = os.path.join(config.clang_src_dir, 'lib', 'Tooling', 'Syntax',
-                            'Pseudo', 'cxx.bnf')
-config.substitutions.append(('%cxx-bnf-file', cxx_bnf_file))
+++ /dev/null
-_ := expr
-expr := id
-id := IDENTIFIER
-
-# RUN: clang-pseudo -grammar %s -print-graph | FileCheck %s --check-prefix=GRAPH
-# GRAPH: States:
-# GRAPH-NEXT: State 0
-# GRAPH-NEXT: _ := • expr
-# GRAPH-NEXT: expr := • id
-# GRAPH-NEXT: id := • IDENTIFIER
-# GRAPH-NEXT: State 1
-# GRAPH-NEXT: _ := expr •
-# GRAPH-NEXT: State 2
-# GRAPH-NEXT: expr := id •
-# GRAPH-NEXT: State 3
-# GRAPH-NEXT: id := IDENTIFIER •
-
-# RUN: clang-pseudo -grammar %s -print-table | FileCheck %s --check-prefix=TABLE
-# TABLE: LRTable:
-# TABLE-NEXT: State 0
-# TABLE-NEXT: 'IDENTIFIER': shift state 3
-# TABLE-NEXT: 'expr': go to state 1
-# TABLE-NEXT: 'id': go to state 2
-# TABLE-NEXT: State 1
-# TABLE-NEXT: 'EOF': accept
-# TABLE-NEXT: State 2
-# TABLE-NEXT: 'EOF': reduce by rule 1 'expr := id'
-# TABLE-NEXT: State 3
-# TABLE-NEXT: 'EOF': reduce by rule 2 'id := IDENTIFIER'
+++ /dev/null
-_ := expr
-expr := expr - expr # S/R conflict at state 4 on '-' token
-expr := IDENTIFIER
-
-# RUN: clang-pseudo -grammar %s -print-graph | FileCheck %s --check-prefix=GRAPH
-# GRAPH: States
-# GRAPH-NEXT: State 0
-# GRAPH-NEXT: _ := • expr
-# GRAPH-NEXT: expr := • expr - expr
-# GRAPH-NEXT: expr := • IDENTIFIER
-# GRAPH-NEXT: State 1
-# GRAPH-NEXT: _ := expr •
-# GRAPH-NEXT: expr := expr • - expr
-# GRAPH-NEXT: State 2
-# GRAPH-NEXT: expr := IDENTIFIER •
-# GRAPH-NEXT: State 3
-# GRAPH-NEXT: expr := • expr - expr
-# GRAPH-NEXT: expr := expr - • expr
-# GRAPH-NEXT: expr := • IDENTIFIER
-# GRAPH-NEXT: State 4
-# GRAPH-NEXT: expr := expr - expr •
-# GRAPH-NEXT: expr := expr • - expr
-# GRAPH-NEXT: 0 ->[expr] 1
-# GRAPH-NEXT: 0 ->[IDENTIFIER] 2
-# GRAPH-NEXT: 1 ->[-] 3
-# GRAPH-NEXT: 3 ->[expr] 4
-# GRAPH-NEXT: 3 ->[IDENTIFIER] 2
-# GRAPH-NEXT: 4 ->[-] 3
-
-# RUN: clang-pseudo -grammar %s -print-table | FileCheck %s --check-prefix=TABLE
-# TABLE: LRTable:
-# TABLE-NEXT: State 0
-# TABLE-NEXT: 'IDENTIFIER': shift state 2
-# TABLE-NEXT: 'expr': go to state 1
-# TABLE-NEXT: State 1
-# TABLE-NEXT: 'EOF': accept
-# TABLE-NEXT: '-': shift state 3
-# TABLE-NEXT: State 2
-# TABLE-NEXT: 'EOF': reduce by rule 1 'expr := IDENTIFIER'
-# TABLE-NEXT: '-': reduce by rule 1 'expr := IDENTIFIER'
-# TABLE-NEXT: State 3
-# TABLE-NEXT: 'IDENTIFIER': shift state 2
-# TABLE-NEXT: 'expr': go to state 4
-# TABLE-NEXT: State 4
-# TABLE-NEXT: 'EOF': reduce by rule 2 'expr := expr - expr'
-# TABLE-NEXT: '-': shift state 3
-# TABLE-NEXT: '-': reduce by rule 2 'expr := expr - expr'
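-
-# The conflict reflects a real ambiguity: at state 4 on '-', reducing first
-# yields the '(a - b) - c' parse while shifting first yields 'a - (b - c)';
-# a GLR parser would explore both.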
add_clang_subdirectory(clang-offload-wrapper)
add_clang_subdirectory(clang-scan-deps)
add_clang_subdirectory(clang-repl)
-add_clang_subdirectory(clang-pseudo)
add_clang_subdirectory(c-index-test)
+++ /dev/null
-set(LLVM_LINK_COMPONENTS support)
-
-add_clang_tool(clang-pseudo
- ClangPseudo.cpp
- )
-
-set(CLANG_PSEUDO_LIB_DEPS
- clangBasic
- clangToolingSyntaxPseudo
- )
-
-clang_target_link_libraries(clang-pseudo
- PRIVATE
- ${CLANG_PSEUDO_LIB_DEPS}
- )
+++ /dev/null
-//===-- ClangPseudo.cpp - Clang pseudo parser tool ------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang/Basic/LangOptions.h"
-#include "clang/Tooling/Syntax/Pseudo/DirectiveMap.h"
-#include "clang/Tooling/Syntax/Pseudo/Grammar.h"
-#include "clang/Tooling/Syntax/Pseudo/LRGraph.h"
-#include "clang/Tooling/Syntax/Pseudo/LRTable.h"
-#include "clang/Tooling/Syntax/Pseudo/Token.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/FormatVariadic.h"
-#include "llvm/Support/MemoryBuffer.h"
-
-using clang::syntax::pseudo::Grammar;
-using llvm::cl::desc;
-using llvm::cl::init;
-using llvm::cl::opt;
-
-static opt<std::string>
- Grammar("grammar", desc("Parse and check a BNF grammar file."), init(""));
-static opt<bool> PrintGrammar("print-grammar", desc("Print the grammar."));
-static opt<bool> PrintGraph("print-graph",
- desc("Print the LR graph for the grammar"));
-static opt<bool> PrintTable("print-table",
- desc("Print the LR table for the grammar"));
-static opt<std::string> Source("source", desc("Source file"));
-static opt<bool> PrintSource("print-source", desc("Print token stream"));
-static opt<bool> PrintTokens("print-tokens", desc("Print detailed token info"));
-static opt<bool>
- PrintDirectiveMap("print-directive-map",
- desc("Print directive structure of source code"));
-
-static std::string readOrDie(llvm::StringRef Path) {
- llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> Text =
- llvm::MemoryBuffer::getFile(Path);
- if (std::error_code EC = Text.getError()) {
-    llvm::errs() << "Error: can't read file '" << Path << "': " << EC.message()
-                 << "\n";
- ::exit(1);
- }
- return Text.get()->getBuffer().str();
-}
-
-int main(int argc, char *argv[]) {
- llvm::cl::ParseCommandLineOptions(argc, argv, "");
-
- if (Grammar.getNumOccurrences()) {
- std::string Text = readOrDie(Grammar);
- std::vector<std::string> Diags;
- auto G = Grammar::parseBNF(Text, Diags);
-
- if (!Diags.empty()) {
- llvm::errs() << llvm::join(Diags, "\n");
- return 2;
- }
-    llvm::outs() << llvm::formatv("grammar file {0} parsed successfully\n",
-                                  Grammar);
- if (PrintGrammar)
- llvm::outs() << G->dump();
- if (PrintGraph)
- llvm::outs() << clang::syntax::pseudo::LRGraph::buildLR0(*G).dumpForTests(
- *G);
- if (PrintTable)
- llvm::outs() << clang::syntax::pseudo::LRTable::buildSLR(*G).dumpForTests(
- *G);
- return 0;
- }
-
- if (Source.getNumOccurrences()) {
- std::string Text = readOrDie(Source);
- clang::LangOptions LangOpts; // FIXME: use real options.
- auto Stream = clang::syntax::pseudo::lex(Text, LangOpts);
- auto Structure = clang::syntax::pseudo::DirectiveMap::parse(Stream);
-
- if (PrintDirectiveMap)
- llvm::outs() << Structure;
- if (PrintSource)
- Stream.print(llvm::outs());
- if (PrintTokens)
- llvm::outs() << Stream;
- }
-
- return 0;
-}
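-
-// Illustrative invocations (flags as defined above):
-//   clang-pseudo -grammar cxx.bnf -print-graph -print-table
-//   clang-pseudo -source input.cc -print-tokens -print-directive-map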
PRIVATE
LLVMTestingSupport
)
-
-add_subdirectory(Pseudo)
+++ /dev/null
-set(LLVM_LINK_COMPONENTS
- Support
- )
-
-add_clang_unittest(ClangPseudoTests
- DirectiveMapTest.cpp
- GrammarTest.cpp
- LRTableTest.cpp
- TokenTest.cpp
-)
-
-clang_target_link_libraries(ClangPseudoTests
- PRIVATE
- clangBasic
- clangLex
- clangToolingSyntaxPseudo
- clangTesting
- )
-
-target_link_libraries(ClangPseudoTests
- PRIVATE
- LLVMTestingSupport
- )
+++ /dev/null
-//===--- DirectiveMapTest.cpp ---------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang/Tooling/Syntax/Pseudo/DirectiveMap.h"
-
-#include "clang/Basic/LangOptions.h"
-#include "clang/Basic/TokenKinds.h"
-#include "clang/Tooling/Syntax/Pseudo/Token.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/ADT/StringRef.h"
-#include "gmock/gmock.h"
-#include "gtest/gtest.h"
-
-namespace clang {
-namespace syntax {
-namespace pseudo {
-namespace {
-
-using testing::_;
-using testing::ElementsAre;
-using testing::Matcher;
-using testing::Pair;
-using testing::StrEq;
-using Chunk = DirectiveMap::Chunk;
-
-MATCHER_P2(tokensAre, TS, Tokens, "tokens are " + std::string(Tokens)) {
- std::vector<llvm::StringRef> Texts;
- for (const Token &Tok : TS.tokens(arg.Tokens))
- Texts.push_back(Tok.text());
- return Matcher<std::string>(StrEq(Tokens))
- .MatchAndExplain(llvm::join(Texts, " "), result_listener);
-}
-
-MATCHER_P(chunkKind, K, "") { return arg.kind() == K; }
-
-TEST(DirectiveMap, Parse) {
- LangOptions Opts;
- std::string Code = R"cpp(
- #include <foo.h>
-
- int main() {
- #ifdef HAS_FOO
- #if HAS_BAR
- foo(bar);
- #else
- foo(0)
- #endif
- #elif NEEDS_FOO
- #error missing_foo
- #endif
- }
- )cpp";
-
- TokenStream S = cook(lex(Code, Opts), Opts);
- DirectiveMap PP = DirectiveMap::parse(S);
-
- ASSERT_THAT(PP.Chunks, ElementsAre(chunkKind(Chunk::K_Directive),
- chunkKind(Chunk::K_Code),
- chunkKind(Chunk::K_Conditional),
- chunkKind(Chunk::K_Code)));
-
- EXPECT_THAT((const DirectiveMap::Directive &)PP.Chunks[0],
- tokensAre(S, "# include < foo . h >"));
- EXPECT_THAT((const DirectiveMap::Code &)PP.Chunks[1],
- tokensAre(S, "int main ( ) {"));
- EXPECT_THAT((const DirectiveMap::Code &)PP.Chunks[3], tokensAre(S, "}"));
-
- const DirectiveMap::Conditional &Ifdef(PP.Chunks[2]);
- EXPECT_THAT(Ifdef.Branches,
- ElementsAre(Pair(tokensAre(S, "# ifdef HAS_FOO"), _),
- Pair(tokensAre(S, "# elif NEEDS_FOO"), _)));
- EXPECT_THAT(Ifdef.End, tokensAre(S, "# endif"));
-
- const DirectiveMap &HasFoo(Ifdef.Branches[0].second);
- const DirectiveMap &NeedsFoo(Ifdef.Branches[1].second);
-
- EXPECT_THAT(HasFoo.Chunks, ElementsAre(chunkKind(Chunk::K_Conditional)));
- const DirectiveMap::Conditional &If(HasFoo.Chunks[0]);
- EXPECT_THAT(If.Branches, ElementsAre(Pair(tokensAre(S, "# if HAS_BAR"), _),
- Pair(tokensAre(S, "# else"), _)));
- EXPECT_THAT(If.Branches[0].second.Chunks,
- ElementsAre(chunkKind(Chunk::K_Code)));
- EXPECT_THAT(If.Branches[1].second.Chunks,
- ElementsAre(chunkKind(Chunk::K_Code)));
-
- EXPECT_THAT(NeedsFoo.Chunks, ElementsAre(chunkKind(Chunk::K_Directive)));
- const DirectiveMap::Directive &Error(NeedsFoo.Chunks[0]);
- EXPECT_THAT(Error, tokensAre(S, "# error missing_foo"));
- EXPECT_EQ(Error.Kind, tok::pp_error);
-}
-
-TEST(DirectiveMap, ParseUgly) {
- LangOptions Opts;
- std::string Code = R"cpp(
- /*A*/ # /*B*/ \
- /*C*/ \
-define \
-BAR /*D*/
-/*E*/
-)cpp";
- TokenStream S = cook(lex(Code, Opts), Opts);
- DirectiveMap PP = DirectiveMap::parse(S);
-
- ASSERT_THAT(PP.Chunks, ElementsAre(chunkKind(Chunk::K_Code),
- chunkKind(Chunk::K_Directive),
- chunkKind(Chunk::K_Code)));
- EXPECT_THAT((const DirectiveMap::Code &)PP.Chunks[0], tokensAre(S, "/*A*/"));
- const DirectiveMap::Directive &Define(PP.Chunks[1]);
- EXPECT_EQ(Define.Kind, tok::pp_define);
- EXPECT_THAT(Define, tokensAre(S, "# /*B*/ /*C*/ define BAR /*D*/"));
- EXPECT_THAT((const DirectiveMap::Code &)PP.Chunks[2], tokensAre(S, "/*E*/"));
-}
-
-TEST(DirectiveMap, ParseBroken) {
- LangOptions Opts;
- std::string Code = R"cpp(
- a
- #endif // mismatched
- #if X
- b
-)cpp";
- TokenStream S = cook(lex(Code, Opts), Opts);
- DirectiveMap PP = DirectiveMap::parse(S);
-
- ASSERT_THAT(PP.Chunks, ElementsAre(chunkKind(Chunk::K_Code),
- chunkKind(Chunk::K_Directive),
- chunkKind(Chunk::K_Conditional)));
- EXPECT_THAT((const DirectiveMap::Code &)PP.Chunks[0], tokensAre(S, "a"));
- const DirectiveMap::Directive &Endif(PP.Chunks[1]);
- EXPECT_EQ(Endif.Kind, tok::pp_endif);
- EXPECT_THAT(Endif, tokensAre(S, "# endif // mismatched"));
-
- const DirectiveMap::Conditional &X(PP.Chunks[2]);
- EXPECT_EQ(1u, X.Branches.size());
- // The (only) branch of the broken conditional section runs until eof.
- EXPECT_EQ(tok::pp_if, X.Branches.front().first.Kind);
- EXPECT_THAT(X.Branches.front().second.Chunks,
- ElementsAre(chunkKind(Chunk::K_Code)));
- // The missing terminating directive is marked as pp_not_keyword.
- EXPECT_EQ(tok::pp_not_keyword, X.End.Kind);
- EXPECT_EQ(0u, X.End.Tokens.size());
-}
-
-} // namespace
-} // namespace pseudo
-} // namespace syntax
-} // namespace clang
+++ /dev/null
-//===--- GrammarTest.cpp - grammar tests -----------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang/Tooling/Syntax/Pseudo/Grammar.h"
-#include "gmock/gmock.h"
-#include "gtest/gtest.h"
-#include <memory>
-
-namespace clang {
-namespace syntax {
-namespace pseudo {
-namespace {
-
-using testing::AllOf;
-using testing::ElementsAre;
-using testing::IsEmpty;
-using testing::Pair;
-using testing::UnorderedElementsAre;
-
-MATCHER_P(TargetID, SID, "") { return arg.Target == SID; }
-template <typename... T> testing::Matcher<const Rule &> Sequence(T... IDs) {
- return testing::Property(&Rule::seq, ElementsAre(IDs...));
-}
-
-class GrammarTest : public ::testing::Test {
-public:
- void build(llvm::StringRef BNF) {
- Diags.clear();
- G = Grammar::parseBNF(BNF, Diags);
- }
-
- SymbolID id(llvm::StringRef Name) const {
- for (unsigned I = 0; I < NumTerminals; ++I)
- if (G->table().Terminals[I] == Name)
- return tokenSymbol(static_cast<tok::TokenKind>(I));
- for (SymbolID ID = 0; ID < G->table().Nonterminals.size(); ++ID)
- if (G->table().Nonterminals[ID].Name == Name)
- return ID;
- ADD_FAILURE() << "No such symbol found: " << Name;
- return 0;
- }
-
-protected:
- std::unique_ptr<Grammar> G;
- std::vector<std::string> Diags;
-};
-
-TEST_F(GrammarTest, Basic) {
- build("_ := IDENTIFIER + _ # comment");
- EXPECT_THAT(Diags, IsEmpty());
-
- auto ExpectedRule =
- AllOf(TargetID(id("_")), Sequence(id("IDENTIFIER"), id("+"), id("_")));
- EXPECT_EQ(G->symbolName(id("_")), "_");
- EXPECT_THAT(G->rulesFor(id("_")), UnorderedElementsAre(ExpectedRule));
- const auto &Rule = G->lookupRule(/*RID=*/0);
- EXPECT_THAT(Rule, ExpectedRule);
- EXPECT_THAT(G->symbolName(Rule.seq()[0]), "IDENTIFIER");
- EXPECT_THAT(G->symbolName(Rule.seq()[1]), "+");
- EXPECT_THAT(G->symbolName(Rule.seq()[2]), "_");
-}
-
-TEST_F(GrammarTest, EliminatedOptional) {
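-  // Eliminating the two independent _opt symbols below expands one written
-  // rule into 2 * 2 = 4 concrete rules.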
- build("_ := CONST_opt INT ;_opt");
- EXPECT_THAT(Diags, IsEmpty());
- EXPECT_THAT(G->table().Rules,
- UnorderedElementsAre(Sequence(id("INT")),
- Sequence(id("CONST"), id("INT")),
- Sequence(id("CONST"), id("INT"), id(";")),
- Sequence(id("INT"), id(";"))));
-}
-
-TEST_F(GrammarTest, Diagnostics) {
- build(R"cpp(
- _ := ,_opt
- _ := undefined-sym
- null :=
-    _ := IDENFIFIE # a typo of the terminal IDENTIFIER
-
- invalid
- )cpp");
-
- EXPECT_EQ(G->startSymbol(), id("_"));
- EXPECT_THAT(Diags, UnorderedElementsAre(
- "Rule '_ := ,_opt' has a nullable RHS",
- "Rule 'null := ' has a nullable RHS",
- "No rules for nonterminal: undefined-sym",
- "Failed to parse 'invalid': no separator :=",
- "Token-like name IDENFIFIE is used as a nonterminal",
- "No rules for nonterminal: IDENFIFIE"));
-}
-
-TEST_F(GrammarTest, FirstAndFollowSets) {
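-  // In the first grammar, ')' lands in FOLLOW(expr) via 'term := ( expr )',
-  // and '-' via 'expr := expr - term'.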
- build(
- R"bnf(
-_ := expr
-expr := expr - term
-expr := term
-term := IDENTIFIER
-term := ( expr )
-)bnf");
- ASSERT_TRUE(Diags.empty());
- auto ToPairs = [](std::vector<llvm::DenseSet<SymbolID>> Input) {
- std::vector<std::pair<SymbolID, llvm::DenseSet<SymbolID>>> Sets;
- for (SymbolID ID = 0; ID < Input.size(); ++ID)
- Sets.emplace_back(ID, std::move(Input[ID]));
- return Sets;
- };
-
- EXPECT_THAT(
- ToPairs(firstSets(*G)),
- UnorderedElementsAre(
- Pair(id("_"), UnorderedElementsAre(id("IDENTIFIER"), id("("))),
- Pair(id("expr"), UnorderedElementsAre(id("IDENTIFIER"), id("("))),
- Pair(id("term"), UnorderedElementsAre(id("IDENTIFIER"), id("(")))));
- EXPECT_THAT(
- ToPairs(followSets(*G)),
- UnorderedElementsAre(
- Pair(id("_"), UnorderedElementsAre(id("EOF"))),
- Pair(id("expr"), UnorderedElementsAre(id("-"), id("EOF"), id(")"))),
- Pair(id("term"), UnorderedElementsAre(id("-"), id("EOF"), id(")")))));
-
- build(R"bnf(
-# A simplified C++ decl-specifier-seq.
-_ := decl-specifier-seq
-decl-specifier-seq := decl-specifier decl-specifier-seq
-decl-specifier-seq := decl-specifier
-decl-specifier := simple-type-specifier
-decl-specifier := INLINE
-simple-type-specifier := INT
- )bnf");
- ASSERT_TRUE(Diags.empty());
- EXPECT_THAT(
- ToPairs(firstSets(*G)),
- UnorderedElementsAre(
- Pair(id("_"), UnorderedElementsAre(id("INLINE"), id("INT"))),
- Pair(id("decl-specifier-seq"),
- UnorderedElementsAre(id("INLINE"), id("INT"))),
- Pair(id("simple-type-specifier"), UnorderedElementsAre(id("INT"))),
- Pair(id("decl-specifier"),
- UnorderedElementsAre(id("INLINE"), id("INT")))));
- EXPECT_THAT(
- ToPairs(followSets(*G)),
- UnorderedElementsAre(
- Pair(id("_"), UnorderedElementsAre(id("EOF"))),
- Pair(id("decl-specifier-seq"), UnorderedElementsAre(id("EOF"))),
- Pair(id("decl-specifier"),
- UnorderedElementsAre(id("INLINE"), id("INT"), id("EOF"))),
- Pair(id("simple-type-specifier"),
- UnorderedElementsAre(id("INLINE"), id("INT"), id("EOF")))));
-}
-
-} // namespace
-} // namespace pseudo
-} // namespace syntax
-} // namespace clang
+++ /dev/null
-//===--- LRTableTest.cpp - LRTable tests -----------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang/Tooling/Syntax/Pseudo/LRTable.h"
-#include "clang/Basic/TokenKinds.h"
-#include "clang/Tooling/Syntax/Pseudo/Grammar.h"
-#include "gmock/gmock.h"
-#include "gtest/gtest.h"
-#include <vector>
-
-namespace clang {
-namespace syntax {
-namespace pseudo {
-namespace {
-
-using testing::IsEmpty;
-using testing::UnorderedElementsAre;
-using Action = LRTable::Action;
-
-TEST(LRTable, Builder) {
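-  // Note that state 0 below deliberately carries both a shift and a reduce on
-  // 'semi': the table stores such conflicts rather than resolving them.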
- GrammarTable GTable;
-
- // eof semi ...
- // +-------+----+-------+---
- // |state0 | | s0,r0 |...
- // |state1 | acc| |...
- // |state2 | | r1 |...
- // +-------+----+-------+---
- std::vector<LRTable::Entry> Entries = {
- {/* State */ 0, tokenSymbol(tok::semi), Action::shift(0)},
- {/* State */ 0, tokenSymbol(tok::semi), Action::reduce(0)},
- {/* State */ 1, tokenSymbol(tok::eof), Action::accept(2)},
- {/* State */ 2, tokenSymbol(tok::semi), Action::reduce(1)}};
- GrammarTable GT;
- LRTable T = LRTable::buildForTests(GT, Entries);
- EXPECT_THAT(T.find(0, tokenSymbol(tok::eof)), IsEmpty());
- EXPECT_THAT(T.find(0, tokenSymbol(tok::semi)),
- UnorderedElementsAre(Action::shift(0), Action::reduce(0)));
- EXPECT_THAT(T.find(1, tokenSymbol(tok::eof)),
- UnorderedElementsAre(Action::accept(2)));
- EXPECT_THAT(T.find(1, tokenSymbol(tok::semi)), IsEmpty());
- EXPECT_THAT(T.find(2, tokenSymbol(tok::semi)),
- UnorderedElementsAre(Action::reduce(1)));
-  // Verify the behavior for terminals that have no available actions.
- EXPECT_THAT(T.find(2, tokenSymbol(tok::kw_int)), IsEmpty());
-}
-
-} // namespace
-} // namespace pseudo
-} // namespace syntax
-} // namespace clang
+++ /dev/null
-//===--- TokenTest.cpp ----------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang/Tooling/Syntax/Pseudo/Token.h"
-#include "clang/Basic/LangOptions.h"
-#include "clang/Basic/TokenKinds.h"
-#include "gmock/gmock.h"
-#include "gtest/gtest.h"
-
-namespace clang {
-namespace syntax {
-namespace pseudo {
-namespace {
-
-using testing::AllOf;
-using testing::ElementsAre;
-using testing::ElementsAreArray;
-using testing::Not;
-
-MATCHER_P2(token, Text, Kind, "") {
- return arg.Kind == Kind && arg.text() == Text;
-}
-
-MATCHER_P(hasFlag, Flag, "") { return arg.flag(Flag); }
-
-MATCHER_P2(lineIndent, Line, Indent, "") {
- return arg.Line == (unsigned)Line && arg.Indent == (unsigned)Indent;
-}
-
-TEST(TokenTest, Lex) {
- LangOptions Opts;
- std::string Code = R"cpp(
- #include <stdio.h>
- int main() {
- return 42; // the answer
- }
- )cpp";
- TokenStream Raw = lex(Code, Opts);
- ASSERT_TRUE(Raw.isFinalized());
- EXPECT_THAT(Raw.tokens(),
- ElementsAreArray({
- // Lexing of directives is weird, especially <angled> strings.
- token("#", tok::hash),
- token("include", tok::raw_identifier),
- token("<", tok::less),
- token("stdio", tok::raw_identifier),
- token(".", tok::period),
- token("h", tok::raw_identifier),
- token(">", tok::greater),
-
- token("int", tok::raw_identifier),
- token("main", tok::raw_identifier),
- token("(", tok::l_paren),
- token(")", tok::r_paren),
- token("{", tok::l_brace),
- token("return", tok::raw_identifier),
- token("42", tok::numeric_constant),
- token(";", tok::semi),
- token("// the answer", tok::comment),
- token("}", tok::r_brace),
- }));
-
- TokenStream Cooked = cook(Raw, Opts);
- ASSERT_TRUE(Cooked.isFinalized());
- EXPECT_THAT(Cooked.tokens(),
- ElementsAreArray({
- // Cooked identifier types in directives are not meaningful.
- token("#", tok::hash),
- token("include", tok::identifier),
- token("<", tok::less),
- token("stdio", tok::identifier),
- token(".", tok::period),
- token("h", tok::identifier),
- token(">", tok::greater),
-
- token("int", tok::kw_int),
- token("main", tok::identifier),
- token("(", tok::l_paren),
- token(")", tok::r_paren),
- token("{", tok::l_brace),
- token("return", tok::kw_return),
- token("42", tok::numeric_constant),
- token(";", tok::semi),
- token("// the answer", tok::comment),
- token("}", tok::r_brace),
- }));
- // Check raw tokens point back into original source code.
- EXPECT_EQ(Raw.tokens().front().text().begin(), &Code[Code.find('#')]);
-}
-
-TEST(TokenTest, LineContinuation) {
- LangOptions Opts;
- std::string Code = R"cpp(
-one_\
-token
-two \
-tokens
- )cpp";
- TokenStream Raw = lex(Code, Opts);
- EXPECT_THAT(
- Raw.tokens(),
- ElementsAre(AllOf(token("one_\\\ntoken", tok::raw_identifier),
- hasFlag(LexFlags::StartsPPLine),
- hasFlag(LexFlags::NeedsCleaning), lineIndent(1, 0)),
- AllOf(token("two", tok::raw_identifier),
- hasFlag(LexFlags::StartsPPLine),
- Not(hasFlag(LexFlags::NeedsCleaning))),
- AllOf(token("\\\ntokens", tok::raw_identifier),
- Not(hasFlag(LexFlags::StartsPPLine)),
- hasFlag(LexFlags::NeedsCleaning))));
-
- TokenStream Cooked = cook(Raw, Opts);
- EXPECT_THAT(
- Cooked.tokens(),
- ElementsAre(AllOf(token("one_token", tok::identifier), lineIndent(1, 0)),
- token("two", tok::identifier),
- token("tokens", tok::identifier)));
-}
-
-TEST(TokenTest, EncodedCharacters) {
- LangOptions Opts;
- Opts.Trigraphs = true;
- Opts.Digraphs = true;
- Opts.C99 = true; // UCNs
- Opts.CXXOperatorNames = true;
- std::string Code = R"(and <: ??! '??=' \u00E9)";
- TokenStream Raw = lex(Code, Opts);
- EXPECT_THAT(
- Raw.tokens(),
- ElementsAre( // and is not recognized as && until cook().
- AllOf(token("and", tok::raw_identifier),
- Not(hasFlag(LexFlags::NeedsCleaning))),
- // Digraphs are just different spellings of tokens.
- AllOf(token("<:", tok::l_square),
- Not(hasFlag(LexFlags::NeedsCleaning))),
-          // Trigraphs are interpreted but still need text cleaning.
- AllOf(token(R"(??!)", tok::pipe), hasFlag(LexFlags::NeedsCleaning)),
- // Trigraphs must be substituted inside constants too.
- AllOf(token(R"('??=')", tok::char_constant),
- hasFlag(LexFlags::NeedsCleaning)),
- // UCNs need substitution.
- AllOf(token(R"(\u00E9)", tok::raw_identifier),
- hasFlag(LexFlags::NeedsCleaning))));
-
- TokenStream Cooked = cook(Raw, Opts);
- EXPECT_THAT(
- Cooked.tokens(),
- ElementsAre(token("and", tok::ampamp), // alternate spelling recognized
- token("<:", tok::l_square),
- token("|", tok::pipe), // trigraph substituted
- token("'#'", tok::char_constant), // trigraph substituted
- token("é", tok::identifier))); // UCN substituted
-}
-
-TEST(TokenTest, Indentation) {
- LangOptions Opts;
- std::string Code = R"cpp( hello world
-no_indent \
- line_was_continued
-)cpp";
- TokenStream Raw = lex(Code, Opts);
- EXPECT_THAT(Raw.tokens(), ElementsAreArray({
- lineIndent(0, 3), // hello
- lineIndent(0, 3), // world
- lineIndent(1, 0), // no_indent
- lineIndent(2, 2), // line_was_continued
- }));
-}
-
-TEST(TokenTest, DropComments) {
- LangOptions Opts;
- std::string Code = R"cpp(
- // comment
- int /*abc*/;
-)cpp";
- TokenStream Raw = cook(lex(Code, Opts), Opts);
- TokenStream Stripped = stripComments(Raw);
- EXPECT_THAT(Raw.tokens(),
- ElementsAreArray(
- {token("// comment", tok::comment), token("int", tok::kw_int),
- token("/*abc*/", tok::comment), token(";", tok::semi)}));
-
- EXPECT_THAT(Stripped.tokens(), ElementsAreArray({token("int", tok::kw_int),
- token(";", tok::semi)}));
-}
-
-} // namespace
-} // namespace pseudo
-} // namespace syntax
-} // namespace clang