From a0f4c10ae227a62c2a63611e64eba83f0ff0f577 Mon Sep 17 00:00:00 2001 From: Sam McCall Date: Wed, 8 Jun 2022 23:27:23 +0200 Subject: [PATCH] [pseudo] Add error-recovery framework & brace-based recovery The idea is: - a parse failure is detected when all heads die when trying to shift the next token - we can recover by choosing a nonterminal we're partway through parsing, and determining where it ends through nonlocal means (e.g. matching brackets) - we can find candidates by walking up the stack from the (ex-)heads - the token range is defined using heuristics attached to grammar rules - the unparsed region is represented in the forest by an Opaque node This patch has the core GLR functionality. It does not allow recovery heuristics to be attached as extensions to the grammar, but rather infers a brace-based heuristic. Expected followups: - make recovery heuristics grammar extensions (depends on D127448) - add recover to our grammar for bracketed constructs and sequence nodes - change the structure of our augmented `_ := start` rules to eliminate some special-cases in glrParse. - (if I can work out how): avoid some spurious recovery cases described in comments - grammar changes to eliminate the hard distinction between init-list and designated-init-list shown in the recovery-init-list.cpp testcase Differential Revision: https://reviews.llvm.org/D128486 --- .../pseudo/include/clang-pseudo/GLR.h | 20 ++ .../pseudo/include/clang-pseudo/grammar/Grammar.h | 14 +- .../pseudo/include/clang-pseudo/grammar/LRGraph.h | 17 +- .../pseudo/include/clang-pseudo/grammar/LRTable.h | 31 ++- clang-tools-extra/pseudo/lib/GLR.cpp | 213 +++++++++++++++++++-- clang-tools-extra/pseudo/lib/grammar/Grammar.cpp | 7 +- .../pseudo/lib/grammar/GrammarBNF.cpp | 11 ++ clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp | 37 +++- .../pseudo/lib/grammar/LRTableBuild.cpp | 28 ++- .../pseudo/test/cxx/empty-member-spec.cpp | 2 +- .../pseudo/test/cxx/recovery-init-list.cpp | 13 ++ clang-tools-extra/pseudo/unittests/GLRTest.cpp | 161 +++++++++++++++- 12 files changed, 516 insertions(+), 38 deletions(-) create mode 100644 clang-tools-extra/pseudo/test/cxx/recovery-init-list.cpp diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h b/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h index a3e8611..edc4b2a 100644 --- a/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h +++ b/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h @@ -144,6 +144,26 @@ void glrShift(llvm::ArrayRef OldHeads, void glrReduce(std::vector &Heads, SymbolID Lookahead, const ParseParams &Params); +// Heuristically recover from a state where no further parsing is possible. +// +// OldHeads is the parse state at TokenIndex. +// This function consumes consumes zero or more tokens (advancing TokenIndex), +// and places any recovery states created in NewHeads. +// +// On failure, NewHeads is empty and TokenIndex is unchanged. +// +// WARNING: glrRecover acts as a "fallback shift". If it consumes no tokens, +// there is a risk of the parser falling into an infinite loop, creating an +// endless sequence of recovery nodes. +// Generally it is safe for recovery to match 0 tokens against sequence symbols +// like `statement-seq`, as the grammar won't permit another statement-seq +// immediately afterwards. However recovery strategies for `statement` should +// consume at least one token, as statements may be adjacent in the input. +void glrRecover(llvm::ArrayRef OldHeads, + unsigned &TokenIndex, const TokenStream &Tokens, + const ParseParams &Params, + std::vector &NewHeads); + } // namespace pseudo } // namespace clang diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h index 382da41..7240f5a 100644 --- a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h +++ b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h @@ -81,9 +81,12 @@ inline tok::TokenKind symbolToToken(SymbolID SID) { assert(SID < NumTerminals); return static_cast(SID); } -inline SymbolID tokenSymbol(tok::TokenKind TK) { +inline constexpr SymbolID tokenSymbol(tok::TokenKind TK) { return TokenFlag | static_cast(TK); } +// Error recovery strategies. +// FIXME: these should be provided as extensions instead. +enum class RecoveryStrategy : uint8_t { None, Braces }; // An extension is a piece of native code specific to a grammar that modifies // the behavior of annotated rules. One ExtensionID is assigned for each unique @@ -107,7 +110,7 @@ struct Rule { // length to 9 (this is the longest sequence in cxx grammar). static constexpr unsigned SizeBits = 4; static constexpr unsigned MaxElements = 9; - static_assert(MaxElements <= (1 << SizeBits), "Exceeds the maximum limit"); + static_assert(MaxElements < (1 << SizeBits), "Exceeds the maximum limit"); static_assert(SizeBits + SymbolBits <= 16, "Must be able to store symbol ID + size efficiently"); @@ -123,6 +126,13 @@ struct Rule { // being set for this rule. ExtensionID Guard = 0; + // Specifies the index within Sequence eligible for error recovery. + // Given stmt := { stmt-seq_opt }, if we fail to parse the stmt-seq then we + // should recover by finding the matching brace, and forcing stmt-seq to match + // everything between braces. + uint8_t RecoveryIndex = -1; + RecoveryStrategy Recovery = RecoveryStrategy::None; + llvm::ArrayRef seq() const { return llvm::ArrayRef(Sequence, Size); } diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h index 1b53653..f5c2cc7 100644 --- a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h +++ b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h @@ -137,8 +137,20 @@ public: SymbolID Label; }; + // A possible error recovery: choose to match some tokens against a symbol. + // + // e.g. a state that contains + // stmt := { . stmt-seq [recover=braces] } + // has a Recovery { Src = S, Strategy=braces, Result=stmt-seq }. + struct Recovery { + StateID Src; // The state we are in when encountering the error. + RecoveryStrategy Strategy; // Heuristic choosing the tokens to match. + SymbolID Result; // The symbol that is produced. + }; + llvm::ArrayRef states() const { return States; } llvm::ArrayRef edges() const { return Edges; } + llvm::ArrayRef recoveries() const { return Recoveries; } llvm::ArrayRef> startStates() const { return StartStates; } @@ -147,12 +159,15 @@ public: private: LRGraph(std::vector States, std::vector Edges, + std::vector Recoveries, std::vector> StartStates) : States(std::move(States)), Edges(std::move(Edges)), - StartStates(std::move(StartStates)) {} + Recoveries(std::move(Recoveries)), StartStates(std::move(StartStates)) { + } std::vector States; std::vector Edges; + std::vector Recoveries; std::vector> StartStates; }; diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h index 70ce529..c8e48db 100644 --- a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h +++ b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h @@ -121,6 +121,14 @@ public: uint16_t Value : ValueBits; }; + struct Recovery { + RecoveryStrategy Strategy; + SymbolID Result; + }; + + // Returns all available actions for the given state on a terminal. + // Expected to be called by LR parsers. + llvm::ArrayRef getActions(StateID State, SymbolID Terminal) const; // Returns the state after we reduce a nonterminal. // Expected to be called by LR parsers. // REQUIRES: Nonterminal is valid here. @@ -151,6 +159,12 @@ public: symbolToToken(Terminal)); } + // Looks up available recovery actions if we stopped parsing in this state. + llvm::ArrayRef getRecovery(StateID State) const { + return llvm::makeArrayRef(Recoveries.data() + RecoveryOffset[State], + Recoveries.data() + RecoveryOffset[State + 1]); + } + // Returns the state from which the LR parser should start to parse the input // tokens as the given StartSymbol. // @@ -188,9 +202,15 @@ public: StateID State; RuleID Rule; }; - // Build a specifid table for testing purposes. - static LRTable buildForTests(const Grammar &G, llvm::ArrayRef, - llvm::ArrayRef); + struct RecoveryEntry { + StateID State; + RecoveryStrategy Strategy; + SymbolID Result; + }; + // Build a specified table for testing purposes. + static LRTable buildForTests(const Grammar &, llvm::ArrayRef, + llvm::ArrayRef, + llvm::ArrayRef = {}); private: // Looks up actions stored in the generic table. @@ -222,6 +242,11 @@ private: // This is flattened by encoding the (SymbolID Nonterminal, tok::Kind Token) // as an index: Nonterminal * NUM_TOKENS + Token. llvm::BitVector FollowSets; + + // Recovery stores all recovery actions from all states. + // A given state has [RecoveryOffset[S], RecoveryOffset[S+1]). + std::vector RecoveryOffset; + std::vector Recoveries; }; llvm::raw_ostream &operator<<(llvm::raw_ostream &, const LRTable::Action &); diff --git a/clang-tools-extra/pseudo/lib/GLR.cpp b/clang-tools-extra/pseudo/lib/GLR.cpp index 0b9cf46..ccb3cd7 100644 --- a/clang-tools-extra/pseudo/lib/GLR.cpp +++ b/clang-tools-extra/pseudo/lib/GLR.cpp @@ -24,6 +24,156 @@ namespace clang { namespace pseudo { +namespace { + +llvm::Optional +findRecoveryEndpoint(RecoveryStrategy Strategy, + const GSS::Node *RecoveryNode, + const TokenStream &Tokens) { + assert(Strategy == RecoveryStrategy::Braces); + const ForestNode *LBrace = RecoveryNode->Payload; + assert(LBrace->kind() == ForestNode::Terminal && + LBrace->symbol() == tokenSymbol(tok::l_brace)); + if (const Token *RBrace = Tokens.tokens()[LBrace->startTokenIndex()].pair()) + return Tokens.index(*RBrace); + return llvm::None; +} + +} // namespace + +void glrRecover(llvm::ArrayRef OldHeads, + unsigned &TokenIndex, const TokenStream &Tokens, + const ParseParams &Params, + std::vector &NewHeads) { + LLVM_DEBUG(llvm::dbgs() << "Recovery at token " << TokenIndex << "...\n"); + // Describes a possibility to recover by forcibly interpreting a range of + // tokens around the cursor as a nonterminal that we expected to see. + struct PlaceholderRecovery { + // The token prior to the nonterminal which is being recovered. + // This starts of the region we're skipping, so higher Position is better. + Token::Index Position; + // The nonterminal which will be created in order to recover. + SymbolID Symbol; + // The heuristic used to choose the bounds of the nonterminal to recover. + RecoveryStrategy Strategy; + + // The GSS head where we are expecting the recovered nonterminal. + const GSS::Node *RecoveryNode; + // Payload of nodes on the way back from the OldHead to the recovery node. + // These represent the partial parse that is being discarded. + // They should become the children of the opaque recovery node. + // + // There may be multiple paths leading to the same recovery node, we choose + // one arbitrarily. + std::vector Path; + }; + std::vector Options; + + // Find recovery options by walking up the stack. + // + // This is similar to exception handling: we walk up the "frames" of nested + // rules being parsed until we find one that has a "handler" which allows us + // to determine the node bounds without parsing it. + // + // Unfortunately there's a significant difference: the stack contains both + // "upward" nodes (ancestor parses) and "leftward" ones. + // e.g. when parsing `int(2 + ?)`, the stack contains: + // expr := expr + . expr - which we're currently parsing + // expr := type ( . expr ) - (up) we should recover this outer expr + // expr := . type ( expr ) - (up+left) we should not recover this type! + // + // It's not obvious how to avoid collecting the latter as a recovery option. + // I think the distinction is ill-defined after merging items into states. + // For now, we have to take this into account when defining recovery rules. + // FIXME: find a more satisfying way to avoid such false recovery. + std::vector Path; + llvm::DenseSet Seen; + auto DFS = [&](const GSS::Node *N, Token::Index NextTok, auto &DFS) { + if (!Seen.insert(N).second) + return; + for (auto Strategy : Params.Table.getRecovery(N->State)) { + Options.push_back(PlaceholderRecovery{ + NextTok, + Strategy.Result, + Strategy.Strategy, + N, + Path, + }); + LLVM_DEBUG(llvm::dbgs() + << "Option: recover " << Params.G.symbolName(Strategy.Result) + << " at token " << NextTok << "\n"); + } + Path.push_back(N->Payload); + for (const GSS::Node *Parent : N->parents()) + DFS(Parent, N->Payload->startTokenIndex(), DFS); + Path.pop_back(); + }; + for (auto *N : llvm::reverse(OldHeads)) + DFS(N, TokenIndex, DFS); + + // Now we select the option(s) we will use to recover. + // + // We prefer options starting further right, as these discard less code + // (e.g. we prefer to recover inner scopes rather than outer ones). + // The options also need to agree on an endpoint, so the parser has a + // consistent position afterwards. + // + // So conceptually we're sorting by the tuple (start, end), though we avoid + // computing `end` for options that can't be winners. + + // Consider options starting further right first. + // Don't drop the others yet though, we may still use them if preferred fails. + llvm::stable_sort(Options, [&](const auto &L, const auto &R) { + return L.Position > R.Position; + }); + + assert(NewHeads.empty()); // We may repeatedly populate and clear it. + llvm::Optional RecoveryRange; + for (const PlaceholderRecovery &Option : Options) { + // If this starts further right than options we've already found, then + // we'll never find anything better. Skip computing End for the rest. + if (RecoveryRange && Option.Position < RecoveryRange->Begin) + break; + + auto End = + findRecoveryEndpoint(Option.Strategy, Option.RecoveryNode, Tokens); + // Only consider recovery that advances the parse. + if (!End || *End <= TokenIndex) + continue; + if (RecoveryRange) { + // If this is worse than our previous options, ignore it. + if (RecoveryRange->End < *End) + continue; + // If this is an improvement over our previous options, then drop them. + if (RecoveryRange->End > *End) + NewHeads.clear(); + } + // Create recovery nodes and heads for them in the GSS. These may be + // discarded if a better recovery is later found, but this path isn't hot. + RecoveryRange = {Option.Position, *End}; + const ForestNode &Placeholder = + Params.Forest.createOpaque(Option.Symbol, Option.Position); + const GSS::Node *NewHead = Params.GSStack.addNode( + Params.Table.getGoToState(Option.RecoveryNode->State, Option.Symbol), + &Placeholder, {Option.RecoveryNode}); + NewHeads.push_back(NewHead); + } + + // Advance the cursor, whether recovery succeeded or not. + if (RecoveryRange) { + LLVM_DEBUG({ + llvm::dbgs() << "Recovered range=" << *RecoveryRange << ":"; + for (const auto *Head : NewHeads) + llvm::dbgs() << " " << Params.G.symbolName(Head->Payload->symbol()); + llvm::dbgs() << "\n"; + }); + TokenIndex = RecoveryRange->End; + } else { + LLVM_DEBUG(llvm::dbgs() << "Recovery failed after trying " << Options.size() + << " strategies\n"); + ++TokenIndex; + } +} using StateID = LRTable::StateID; @@ -31,8 +181,9 @@ llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const GSS::Node &N) { std::vector ParentStates; for (const auto *Parent : N.parents()) ParentStates.push_back(llvm::formatv("{0}", Parent->State)); - OS << llvm::formatv("state {0}, parsed symbol {1}, parents {2}", N.State, - N.Payload->symbol(), llvm::join(ParentStates, ", ")); + OS << llvm::formatv("state {0}, parsed symbol {1}, parents {3}", N.State, + N.Payload ? N.Payload->symbol() : 0, + llvm::join(ParentStates, ", ")); return OS; } @@ -427,15 +578,27 @@ const ForestNode &glrParse(const TokenStream &Tokens, const ParseParams &Params, GSS.gc(std::move(Roots)); }; // Each iteration fully processes a single token. - for (unsigned I = 0; I < Terminals.size(); ++I) { + for (unsigned I = 0; I < Terminals.size();) { LLVM_DEBUG(llvm::dbgs() << llvm::formatv( "Next token {0} (id={1})\n", G.symbolName(Terminals[I].symbol()), Terminals[I].symbol())); // Consume the token. glrShift(Heads, Terminals[I], Params, NextHeads); + + // If we weren't able to consume the token, try to skip over some tokens + // so we can keep parsing. + if (NextHeads.empty()) { + glrRecover(Heads, I, Tokens, Params, NextHeads); + if (NextHeads.empty()) + // FIXME: Ensure the `_ := start-symbol` rules have a fallback + // error-recovery strategy attached. Then this condition can't happen. + return Params.Forest.createOpaque(StartSymbol, /*Token::Index=*/0); + } else + ++I; + // Form nonterminals containing the token we just consumed. - SymbolID Lookahead = I + 1 == Terminals.size() ? tokenSymbol(tok::eof) - : Terminals[I + 1].symbol(); + SymbolID Lookahead = + I == Terminals.size() ? tokenSymbol(tok::eof) : Terminals[I].symbol(); Reduce(NextHeads, Lookahead); // Prepare for the next token. std::swap(Heads, NextHeads); @@ -444,22 +607,35 @@ const ForestNode &glrParse(const TokenStream &Tokens, const ParseParams &Params, } LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Reached eof\n")); + // The parse was successful if we're in state `_ := start-symbol .` StateID AcceptState = Params.Table.getGoToState(StartState, StartSymbol); - const ForestNode *Result = nullptr; - for (const auto *Head : Heads) { - if (Head->State == AcceptState) { - assert(Head->Payload->symbol() == StartSymbol); - assert(Result == nullptr && "multiple results!"); - Result = Head->Payload; + auto SearchForAccept = [&](llvm::ArrayRef Heads) { + const ForestNode *Result = nullptr; + for (const auto *Head : Heads) { + if (Head->State == AcceptState) { + assert(Head->Payload->symbol() == StartSymbol); + assert(Result == nullptr && "multiple results!"); + Result = Head->Payload; + } } - } - if (Result) + return Result; + }; + if (auto *Result = SearchForAccept(Heads)) return *Result; + // Failed to parse the input, attempt to run recovery. + // FIXME: this awkwardly repeats the recovery in the loop, when shift fails. + // More elegant is to include EOF in the token stream, and make the + // augmented rule: `_ := translation-unit EOF`. In this way recovery at EOF + // would not be a special case: it show up as a failure to shift the EOF + // token. + unsigned I = Terminals.size(); + glrRecover(Heads, I, Tokens, Params, NextHeads); + Reduce(NextHeads, tokenSymbol(tok::eof)); + if (auto *Result = SearchForAccept(NextHeads)) + return *Result; + // We failed to parse the input, returning an opaque forest node for recovery. - // - // FIXME: We will need to invoke our generic error-recovery handlers when we - // reach EOF without reaching accept state, and involving the eof - // token in the above main for-loopmay be the best way to reuse the code). + // FIXME: as above, we can add fallback error handling so this is impossible. return Params.Forest.createOpaque(StartSymbol, /*Token::Index=*/0); } @@ -470,9 +646,10 @@ void glrReduce(std::vector &Heads, SymbolID Lookahead, } const GSS::Node *GSS::addNode(LRTable::StateID State, const ForestNode *Symbol, + llvm::ArrayRef Parents) { Node *Result = new (allocate(Parents.size())) - Node({State, GCParity, static_cast(Parents.size())}); + Node({State, GCParity, static_cast(Parents.size())}); Alive.push_back(Result); ++NodesCreated; Result->Payload = Symbol; diff --git a/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp b/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp index 8e3bcb7..dc4d958 100644 --- a/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp +++ b/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp @@ -59,8 +59,11 @@ std::string Grammar::dumpRule(RuleID RID) const { llvm::raw_string_ostream OS(Result); const Rule &R = T->Rules[RID]; OS << symbolName(R.Target) << " :="; - for (SymbolID SID : R.seq()) - OS << " " << symbolName(SID); + for (unsigned I = 0; I < R.Size; ++I) { + OS << " " << symbolName(R.Sequence[I]); + if (R.RecoveryIndex == I) + OS << " [recover=" << static_cast(R.Recovery) << "]"; + } if (R.Guard) OS << " [guard=" << T->AttributeValues[R.Guard] << "]"; return Result; diff --git a/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp b/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp index 9fbc34d..281c086 100644 --- a/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp +++ b/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp @@ -106,6 +106,17 @@ public: assert(T->Rules.size() < (1 << RuleBits) && "Too many rules to fit in RuleID bits!"); + // Wherever RHS contains { foo }, mark foo for brace-recovery. + // FIXME: this should be grammar annotations instead. + for (auto &Rule : T->Rules) { + for (unsigned I = 2; I < Rule.Size; ++I) + if (Rule.Sequence[I] == tokenSymbol(tok::r_brace) && + Rule.Sequence[I - 2] == tokenSymbol(tok::l_brace) && + !isToken(Rule.Sequence[I - 1])) { + Rule.Recovery = RecoveryStrategy::Braces; + Rule.RecoveryIndex = I - 1; + } + } const auto &SymbolOrder = getTopologicalOrder(T.get()); llvm::stable_sort( T->Rules, [&SymbolOrder](const Rule &Left, const Rule &Right) { diff --git a/clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp b/clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp index 5312a67..290a4f3 100644 --- a/clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp +++ b/clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp @@ -120,6 +120,20 @@ nextAvailableKernelItems(const State &S, const Grammar &G) { return Results; } +std::vector> +availableRecovery(const State &S, const Grammar &G) { + std::vector> Result; + for (const Item &I : S.Items) { + const auto &Rule = G.lookupRule(I.rule()); + if (I.dot() != Rule.RecoveryIndex) + continue; + Result.push_back({Rule.Recovery, Rule.seq()[Rule.RecoveryIndex]}); + } + llvm::sort(Result); + Result.erase(std::unique(Result.begin(), Result.end()), Result.end()); + return Result; +} + } // namespace std::string Item::dump(const Grammar &G) const { @@ -130,9 +144,10 @@ std::string Item::dump(const Grammar &G) const { Results.push_back(G.symbolName(SID)); return Results; }; - return llvm::formatv("{0} := {1} • {2}", G.symbolName(Rule.Target), + return llvm::formatv("{0} := {1} • {2}{3}", G.symbolName(Rule.Target), llvm::join(ToNames(Rule.seq().take_front(DotPos)), " "), - llvm::join(ToNames(Rule.seq().drop_front(DotPos)), " ")) + llvm::join(ToNames(Rule.seq().drop_front(DotPos)), " "), + Rule.RecoveryIndex == DotPos ? " [recovery]" : "") .str(); } @@ -181,6 +196,11 @@ LRGraph LRGraph::buildLR0(const Grammar &G) { Edges.push_back({Src, Dst, Label}); } + void insertRecovery(StateID Src, RecoveryStrategy Strategy, + SymbolID Result) { + Recoveries.push_back({Src, Strategy, Result}); + } + // Returns a state with the given id. const State &find(StateID ID) const { assert(ID < States.size()); @@ -194,9 +214,10 @@ LRGraph LRGraph::buildLR0(const Grammar &G) { LRGraph build() && { States.shrink_to_fit(); Edges.shrink_to_fit(); + Recoveries.shrink_to_fit(); llvm::sort(StartStates); StartStates.shrink_to_fit(); - return LRGraph(std::move(States), std::move(Edges), + return LRGraph(std::move(States), std::move(Edges), std::move(Recoveries), std::move(StartStates)); } @@ -205,6 +226,7 @@ LRGraph LRGraph::buildLR0(const Grammar &G) { llvm::DenseMap StatesIndex; std::vector States; std::vector Edges; + std::vector Recoveries; const Grammar &G; std::vector> StartStates; } Builder(G); @@ -225,15 +247,16 @@ LRGraph LRGraph::buildLR0(const Grammar &G) { } while (!PendingStates.empty()) { - auto CurrentStateID = PendingStates.back(); + auto StateID = PendingStates.back(); PendingStates.pop_back(); - for (auto Next : - nextAvailableKernelItems(Builder.find(CurrentStateID), G)) { + for (auto Next : nextAvailableKernelItems(Builder.find(StateID), G)) { auto Insert = Builder.insert(Next.second); if (Insert.second) // new state, insert to the pending queue. PendingStates.push_back(Insert.first); - Builder.insertEdge(CurrentStateID, Insert.first, Next.first); + Builder.insertEdge(StateID, Insert.first, Next.first); } + for (auto Recovery : availableRecovery(Builder.find(StateID), G)) + Builder.insertRecovery(StateID, Recovery.first, Recovery.second); } return std::move(Builder).build(); } diff --git a/clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp b/clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp index 59ea4ce..3d36042 100644 --- a/clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp +++ b/clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp @@ -11,6 +11,7 @@ #include "clang-pseudo/grammar/LRTable.h" #include "clang/Basic/TokenKinds.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/Support/raw_ostream.h" #include namespace llvm { @@ -44,6 +45,7 @@ struct LRTable::Builder { llvm::DenseSet Entries; llvm::DenseMap> Reduces; std::vector> FollowSets; + std::vector Recoveries; LRTable build(unsigned NumStates) && { // E.g. given the following parsing table with 3 states and 3 terminals: @@ -88,6 +90,26 @@ struct LRTable::Builder { } Table.StartStates = std::move(StartStates); + // Error recovery entries: sort (no dups already), and build offset lookup. + llvm::sort(Recoveries, + [&](const LRGraph::Recovery &L, const LRGraph::Recovery &R) { + return std::tie(L.Src, L.Result, L.Strategy) < + std::tie(R.Src, R.Result, R.Strategy); + }); + Table.Recoveries.reserve(Recoveries.size()); + for (const auto &R : Recoveries) + Table.Recoveries.push_back({R.Strategy, R.Result}); + Table.RecoveryOffset = std::vector(NumStates + 1, 0); + SortedIndex = 0; + for (StateID State = 0; State < NumStates; ++State) { + Table.RecoveryOffset[State] = SortedIndex; + while (SortedIndex < Recoveries.size() && + Recoveries[SortedIndex].Src == State) + SortedIndex++; + } + Table.RecoveryOffset[NumStates] = SortedIndex; + assert(SortedIndex == Recoveries.size()); + // Compile the follow sets into a bitmap. Table.FollowSets.resize(tok::NUM_TOKENS * FollowSets.size()); for (SymbolID NT = 0; NT < FollowSets.size(); ++NT) @@ -114,7 +136,8 @@ struct LRTable::Builder { }; LRTable LRTable::buildForTests(const Grammar &G, llvm::ArrayRef Entries, - llvm::ArrayRef Reduces) { + llvm::ArrayRef Reduces, + llvm::ArrayRef Recoveries) { StateID MaxState = 0; for (const auto &Entry : Entries) { MaxState = std::max(MaxState, Entry.State); @@ -128,6 +151,8 @@ LRTable LRTable::buildForTests(const Grammar &G, llvm::ArrayRef Entries, for (const ReduceEntry &E : Reduces) Build.Reduces[E.State].insert(E.Rule); Build.FollowSets = followSets(G); + for (const auto &R : Recoveries) + Build.Recoveries.push_back({R.State, R.Strategy, R.Result}); return std::move(Build).build(/*NumStates=*/MaxState + 1); } @@ -135,6 +160,7 @@ LRTable LRTable::buildSLR(const Grammar &G) { auto Graph = LRGraph::buildLR0(G); Builder Build; Build.StartStates = Graph.startStates(); + Build.Recoveries = Graph.recoveries(); for (const auto &T : Graph.edges()) { Action Act = isToken(T.Label) ? Action::shift(T.Dst) : Action::goTo(T.Dst); Build.Entries.insert({T.Src, T.Label, Act}); diff --git a/clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp b/clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp index 6d7a682..bcfdff1 100644 --- a/clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp +++ b/clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp @@ -2,7 +2,7 @@ class Foo { public: }; -// CHECK: decl-specifier-seq~class-specifier := class-head { member-specification } +// CHECK: decl-specifier-seq~class-specifier := class-head { member-specification [recover=1] } // CHECK-NEXT: ├─class-head := class-key class-head-name // CHECK-NEXT: │ ├─class-key~CLASS := tok[0] // CHECK-NEXT: │ └─class-head-name~IDENTIFIER := tok[1] diff --git a/clang-tools-extra/pseudo/test/cxx/recovery-init-list.cpp b/clang-tools-extra/pseudo/test/cxx/recovery-init-list.cpp new file mode 100644 index 0000000..a8bbe88 --- /dev/null +++ b/clang-tools-extra/pseudo/test/cxx/recovery-init-list.cpp @@ -0,0 +1,13 @@ +// RUN: clang-pseudo -grammar=%cxx-bnf-file -source=%s --print-forest | FileCheck %s +auto x = { complete garbage }; +// CHECK: translation-unit~simple-declaration +// CHECK-NEXT: ├─decl-specifier-seq~AUTO := tok[0] +// CHECK-NEXT: ├─init-declarator-list~init-declarator +// CHECK-NEXT: │ ├─declarator~IDENTIFIER := tok[1] +// CHECK-NEXT: │ └─initializer~brace-or-equal-initializer +// CHECK-NEXT: │ ├─= := tok[2] +// CHECK-NEXT: │ └─initializer-clause~braced-init-list +// CHECK-NEXT: │ ├─{ := tok[3] +// CHECK-NEXT: │ ├─initializer-list := +// CHECK-NEXT: │ └─} := tok[6] +// CHECK-NEXT: └─; := tok[7] diff --git a/clang-tools-extra/pseudo/unittests/GLRTest.cpp b/clang-tools-extra/pseudo/unittests/GLRTest.cpp index 846f2db..a37fda3 100644 --- a/clang-tools-extra/pseudo/unittests/GLRTest.cpp +++ b/clang-tools-extra/pseudo/unittests/GLRTest.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "clang-pseudo/GLR.h" +#include "clang-pseudo/Bracket.h" #include "clang-pseudo/Token.h" #include "clang-pseudo/grammar/Grammar.h" #include "clang/Basic/LangOptions.h" @@ -32,11 +33,13 @@ namespace { using Action = LRTable::Action; using testing::AllOf; using testing::ElementsAre; +using testing::IsEmpty; using testing::UnorderedElementsAre; MATCHER_P(state, StateID, "") { return arg->State == StateID; } MATCHER_P(parsedSymbol, FNode, "") { return arg->Payload == FNode; } MATCHER_P(parsedSymbolID, SID, "") { return arg->Payload->symbol() == SID; } +MATCHER_P(start, Start, "") { return arg->Payload->startTokenIndex() == Start; } testing::Matcher parents(llvm::ArrayRef Parents) { @@ -238,9 +241,9 @@ TEST_F(GLRTest, ReduceJoiningWithMultipleBases) { /*State=*/1, /*ForestNode=*/CVQualifierNode, /*Parents=*/{GSSNode0}); const auto *GSSNode2 = GSStack.addNode( /*State=*/2, /*ForestNode=*/CVQualifierNode, /*Parents=*/{GSSNode0}); - const auto *GSSNode3 = - GSStack.addNode(/*State=*/3, /*ForestNode=*/ClassNameNode, - /*Parents=*/{GSSNode1}); + const auto *GSSNode3 = GSStack.addNode( + /*State=*/3, /*ForestNode=*/ClassNameNode, + /*Parents=*/{GSSNode1}); const auto *GSSNode4 = GSStack.addNode(/*State=*/4, /*ForestNode=*/EnumNameNode, /*Parents=*/{GSSNode2}); @@ -363,6 +366,124 @@ TEST_F(GLRTest, ReduceLookahead) { EXPECT_THAT(Heads, ElementsAre(GSSNode1)); } +TEST_F(GLRTest, Recover) { + // Recovery while parsing "word" inside braces. + // Before: + // 0--1({)--2(?) + // After recovering a `word` at state 1: + // 0--3(word) // 3 is goto(1, word) + buildGrammar({"word"}, {}); + LRTable Table = LRTable::buildForTests( + G, {{/*State=*/1, id("word"), Action::goTo(3)}}, /*Reduce=*/{}, + /*Recovery=*/{{/*State=*/1, RecoveryStrategy::Braces, id("word")}}); + + auto *LBrace = &Arena.createTerminal(tok::l_brace, 0); + auto *Question1 = &Arena.createTerminal(tok::question, 1); + const auto *Root = GSStack.addNode(0, nullptr, {}); + const auto *OpenedBraces = GSStack.addNode(1, LBrace, {Root}); + const auto *AfterQuestion1 = GSStack.addNode(2, Question1, {OpenedBraces}); + + // Need a token stream with paired braces so the strategy works. + clang::LangOptions LOptions; + TokenStream Tokens = cook(lex("{ ? ? ? }", LOptions), LOptions); + pairBrackets(Tokens); + std::vector NewHeads; + + unsigned TokenIndex = 2; + glrRecover({AfterQuestion1}, TokenIndex, Tokens, {G, Table, Arena, GSStack}, + NewHeads); + EXPECT_EQ(TokenIndex, 4u) << "should skip ahead to matching brace"; + EXPECT_THAT(NewHeads, ElementsAre( + AllOf(state(3), parsedSymbolID(id("word")), + parents({OpenedBraces}), start(1u)))); + EXPECT_EQ(NewHeads.front()->Payload->kind(), ForestNode::Opaque); + + // Test recovery failure: omit closing brace so strategy fails + TokenStream NoRBrace = cook(lex("{ ? ? ? ?", LOptions), LOptions); + pairBrackets(NoRBrace); + NewHeads.clear(); + TokenIndex = 2; + glrRecover({AfterQuestion1}, TokenIndex, NoRBrace, + {G, Table, Arena, GSStack}, NewHeads); + EXPECT_EQ(TokenIndex, 3u) << "should advance by 1 by default"; + EXPECT_THAT(NewHeads, IsEmpty()); +} + +TEST_F(GLRTest, RecoverRightmost) { + // In a nested block structure, we recover at the innermost possible block. + // Before: + // 0--1({)--1({)--1({) + // After recovering a `block` at inside the second braces: + // 0--1({)--2(body) // 2 is goto(1, body) + buildGrammar({"body"}, {}); + LRTable Table = LRTable::buildForTests( + G, {{/*State=*/1, id("body"), Action::goTo(2)}}, /*Reduce=*/{}, + /*Recovery=*/{{/*State=*/1, RecoveryStrategy::Braces, id("body")}}); + + clang::LangOptions LOptions; + // Innermost brace is unmatched, to test fallback to next brace. + TokenStream Tokens = cook(lex("{ { { ? ? } }", LOptions), LOptions); + Tokens.tokens()[0].Pair = 5; + Tokens.tokens()[1].Pair = 4; + Tokens.tokens()[4].Pair = 1; + Tokens.tokens()[5].Pair = 0; + + auto *Brace1 = &Arena.createTerminal(tok::l_brace, 0); + auto *Brace2 = &Arena.createTerminal(tok::l_brace, 1); + auto *Brace3 = &Arena.createTerminal(tok::l_brace, 2); + const auto *Root = GSStack.addNode(0, nullptr, {}); + const auto *In1 = GSStack.addNode(1, Brace1, {Root}); + const auto *In2 = GSStack.addNode(1, Brace2, {In1}); + const auto *In3 = GSStack.addNode(1, Brace3, {In2}); + + unsigned TokenIndex = 3; + std::vector NewHeads; + glrRecover({In3}, TokenIndex, Tokens, {G, Table, Arena, GSStack}, NewHeads); + EXPECT_EQ(TokenIndex, 5u); + EXPECT_THAT(NewHeads, ElementsAre(AllOf(state(2), parsedSymbolID(id("body")), + parents({In2}), start(2u)))); +} + +TEST_F(GLRTest, RecoverAlternatives) { + // Recovery inside braces with multiple equally good options + // Before: + // 0--1({) + // After recovering either `word` or `number` inside the braces: + // 0--1({)--2(word) // 2 is goto(1, word) + // └--3(number) // 3 is goto(1, number) + buildGrammar({"number", "word"}, {}); + LRTable Table = LRTable::buildForTests( + G, + { + {/*State=*/1, id("number"), Action::goTo(2)}, + {/*State=*/1, id("word"), Action::goTo(3)}, + }, + /*Reduce=*/{}, + /*Recovery=*/ + { + {/*State=*/1, RecoveryStrategy::Braces, id("number")}, + {/*State=*/1, RecoveryStrategy::Braces, id("word")}, + }); + auto *LBrace = &Arena.createTerminal(tok::l_brace, 0); + const auto *Root = GSStack.addNode(0, nullptr, {}); + const auto *OpenedBraces = GSStack.addNode(1, LBrace, {Root}); + + clang::LangOptions LOptions; + TokenStream Tokens = cook(lex("{ ? }", LOptions), LOptions); + pairBrackets(Tokens); + std::vector NewHeads; + unsigned TokenIndex = 1; + + glrRecover({OpenedBraces}, TokenIndex, Tokens, {G, Table, Arena, GSStack}, + NewHeads); + EXPECT_EQ(TokenIndex, 2u); + EXPECT_THAT(NewHeads, + UnorderedElementsAre(AllOf(state(2), parsedSymbolID(id("number")), + parents({OpenedBraces}), start(1u)), + AllOf(state(3), parsedSymbolID(id("word")), + parents({OpenedBraces}), start(1u)))); +} + TEST_F(GLRTest, PerfectForestNodeSharing) { // Run the GLR on a simple grammar and test that we build exactly one forest // node per (SymbolID, token range). @@ -431,6 +552,40 @@ TEST_F(GLRTest, GLRReduceOrder) { "[ 0, end) └─IDENTIFIER := tok[0]\n"); } +TEST_F(GLRTest, RecoveryEndToEnd) { + // Simple example of brace-based recovery showing: + // - recovered region includes tokens both ahead of and behind the cursor + // - multiple possible recovery rules + // - recovery from outer scopes is rejected + build(R"bnf( + _ := block + + block := { block } + block := { numbers } + numbers := NUMERIC_CONSTANT NUMERIC_CONSTANT + )bnf"); + auto LRTable = LRTable::buildSLR(G); + clang::LangOptions LOptions; + TokenStream Tokens = cook(lex("{ { 42 ? } }", LOptions), LOptions); + pairBrackets(Tokens); + + const ForestNode &Parsed = + glrParse(Tokens, {G, LRTable, Arena, GSStack}, id("block")); + EXPECT_EQ(Parsed.dumpRecursive(G), + "[ 0, end) block := { block [recover=1] }\n" + "[ 0, 1) ├─{ := tok[0]\n" + "[ 1, 5) ├─block := \n" + "[ 1, 5) │ ├─block := { block [recover=1] }\n" + "[ 1, 2) │ │ ├─{ := tok[1]\n" + "[ 2, 4) │ │ ├─block := \n" + "[ 4, 5) │ │ └─} := tok[4]\n" + "[ 1, 5) │ └─block := { numbers [recover=1] }\n" + "[ 1, 2) │ ├─{ := tok[1]\n" + "[ 2, 4) │ ├─numbers := \n" + "[ 4, 5) │ └─} := tok[4]\n" + "[ 5, end) └─} := tok[5]\n"); +} + TEST_F(GLRTest, NoExplicitAccept) { build(R"bnf( _ := test -- 2.7.4