[pseudo] Introduce parse forest.
authorHaojian Wu <hokein.wu@gmail.com>
Mon, 21 Mar 2022 14:32:15 +0000 (15:32 +0100)
committerHaojian Wu <hokein.wu@gmail.com>
Thu, 24 Mar 2022 13:47:17 +0000 (14:47 +0100)
Parse forest is the output of the GLR parser, it is a tree-like DAG
which presents all possible parse trees without duplicating subparse structures.

This is a patch split from https://reviews.llvm.org/D121150.

Differential Revision: https://reviews.llvm.org/D122139

clang-tools-extra/pseudo/include/clang-pseudo/Forest.h [new file with mode: 0644]
clang-tools-extra/pseudo/lib/CMakeLists.txt
clang-tools-extra/pseudo/lib/Forest.cpp [new file with mode: 0644]
clang-tools-extra/pseudo/unittests/CMakeLists.txt
clang-tools-extra/pseudo/unittests/ForestTest.cpp [new file with mode: 0644]

diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h b/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h
new file mode 100644 (file)
index 0000000..650f913
--- /dev/null
@@ -0,0 +1,178 @@
+//===--- Forest.h - Parse forest, the output of the GLR parser ---*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// A parse forest represents a set of possible parse trees efficiently, it is
+// produced by the GLR parser.
+//
+// Despite the name, its data structure is a tree-like DAG with a single root.
+// Multiple ways to parse the same tokens are presented as an ambiguous node
+// with all possible interpretations as children.
+// Common sub-parses are shared: if two interpretations both parse "1 + 1" as
+// "expr := expr + expr", they will share a Sequence node representing the expr.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang-pseudo/Grammar.h"
+#include "clang-pseudo/Token.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Allocator.h"
+#include <cstdint>
+
+namespace clang {
+namespace pseudo {
+
+// A node represents ways to parse a sequence of tokens, it interprets a fixed
+// range of tokens as a fixed grammar symbol.
+//
+// There are different kinds of nodes, some nodes have "children" (stored in a
+// trailing array) and have pointers to them. "Children" has different semantics
+// depending on the node kinds. For an Ambiguous node, it means all
+// possible interpretations; for a Sequence node, it means each symbol on the
+// right hand side of the production rule.
+//
+// Since this is a node in a DAG, a node may have multiple parents. And a node
+// doesn't have parent pointers.
+class alignas(class ForestNode *) ForestNode {
+public:
+  enum Kind : uint8_t {
+    // A Terminal node is a single terminal symbol bound to a token.
+    Terminal,
+    // A Sequence node is a nonterminal symbol parsed from a grammar rule,
+    // elements() are the parses of each symbol on the RHS of the rule.
+    // If the rule is A := X Y Z, the node is for nonterminal A, and elements()
+    // are [X, Y, Z].
+    Sequence,
+    // An Ambiguous node exposes multiple ways to interpret the code as the
+    // same symbol, alternatives() are all possible parses.
+    Ambiguous,
+    // An Opaque node is a placeholder. It asserts that tokens match a symbol,
+    // without saying how.
+    // It is used for lazy-parsing (not parsed yet), or error-recovery (invalid
+    // code).
+    Opaque,
+  };
+  Kind kind() const { return K; }
+
+  SymbolID symbol() const { return Symbol; }
+
+  // The start of the token range, it is a poistion within a token stream.
+  Token::Index startTokenIndex() const { return StartIndex; }
+
+  // Returns the corresponding grammar rule.
+  // REQUIRES: this is a Sequence node.
+  RuleID rule() const {
+    assert(kind() == Sequence);
+    return Data & ((1 << RuleBits) - 1);
+  }
+  // Returns the parses of each element on the RHS of the rule.
+  // REQUIRES: this is a Sequence node;
+  llvm::ArrayRef<const ForestNode *> elements() const {
+    assert(kind() == Sequence);
+    return children(Data >> RuleBits);
+  };
+
+  // Returns all possible interpretations of the code.
+  // REQUIRES: this is an Ambiguous node.
+  llvm::ArrayRef<const ForestNode *> alternatives() const {
+    assert(kind() == Ambiguous);
+    return children(Data);
+  }
+
+  std::string dump(const Grammar &) const;
+  std::string dumpRecursive(const Grammar &, bool Abbreviated = false) const;
+
+private:
+  friend class ForestArena;
+
+  ForestNode(Kind K, SymbolID Symbol, Token::Index StartIndex, uint16_t Data)
+      : StartIndex(StartIndex), K(K), Symbol(Symbol), Data(Data) {}
+
+  ForestNode(const ForestNode &) = delete;
+  ForestNode &operator=(const ForestNode &) = delete;
+  ForestNode(ForestNode &&) = delete;
+  ForestNode &operator=(ForestNode &&) = delete;
+
+  static uint16_t sequenceData(RuleID Rule,
+                               llvm::ArrayRef<const ForestNode *> Elements) {
+    assert(Rule < (1 << RuleBits));
+    assert(Elements.size() < (1 << (16 - RuleBits)));
+    return Rule | Elements.size() << RuleBits;
+  }
+  static uint16_t
+  ambiguousData(llvm::ArrayRef<const ForestNode *> Alternatives) {
+    return Alternatives.size();
+  }
+
+  // Retrieves the trailing array.
+  llvm::ArrayRef<const ForestNode *> children(uint16_t Num) const {
+    return llvm::makeArrayRef(reinterpret_cast<ForestNode *const *>(this + 1),
+                              Num);
+  }
+
+  Token::Index StartIndex;
+  Kind K : 4;
+  SymbolID Symbol : SymbolBits;
+  // Sequence - child count : 4 | RuleID : RuleBits (12)
+  // Ambiguous - child count : 16
+  // Terminal, Opaque - unused
+  uint16_t Data;
+  // An array of ForestNode* following the object.
+};
+// ForestNode may not be destroyed (for BumpPtrAllocator).
+static_assert(std::is_trivially_destructible<ForestNode>(), "");
+
+// A memory arena for the parse forest.
+class ForestArena {
+public:
+  llvm::ArrayRef<ForestNode> createTerminals(const TokenStream &Code);
+  ForestNode &createSequence(SymbolID SID, RuleID RID,
+                             llvm::ArrayRef<const ForestNode *> Elements) {
+    assert(!Elements.empty());
+    return create(ForestNode::Sequence, SID,
+                  Elements.front()->startTokenIndex(),
+                  ForestNode::sequenceData(RID, Elements), Elements);
+  }
+  ForestNode &createAmbiguous(SymbolID SID,
+                              llvm::ArrayRef<const ForestNode *> Alternatives) {
+    assert(!Alternatives.empty());
+    assert(llvm::all_of(Alternatives,
+                        [SID](const ForestNode *Alternative) {
+                          return SID == Alternative->symbol();
+                        }) &&
+           "Ambiguous alternatives must represent the same symbol!");
+    return create(ForestNode::Ambiguous, SID,
+                  Alternatives.front()->startTokenIndex(),
+                  ForestNode::ambiguousData(Alternatives), Alternatives);
+  }
+  ForestNode &createOpaque(SymbolID SID, Token::Index Start) {
+    return create(ForestNode::Opaque, SID, Start, 0, {});
+  }
+
+  size_t nodeCount() const { return NodeCount; }
+  size_t bytes() const { return Arena.getBytesAllocated() + sizeof(this); }
+
+private:
+  ForestNode &create(ForestNode::Kind K, SymbolID SID, Token::Index Start,
+                     uint16_t Data,
+                     llvm::ArrayRef<const ForestNode *> Elements) {
+    ++NodeCount;
+    ForestNode *New = new (Arena.Allocate(
+        sizeof(ForestNode) + Elements.size() * sizeof(ForestNode *),
+        alignof(ForestNode))) ForestNode(K, SID, Start, Data);
+    if (!Elements.empty())
+      llvm::copy(Elements, reinterpret_cast<const ForestNode **>(New + 1));
+    return *New;
+  }
+
+  llvm::BumpPtrAllocator Arena;
+  uint32_t NodeCount = 0;
+};
+
+} // namespace pseudo
+} // namespace clang
index 7239a58..e230938 100644 (file)
@@ -2,6 +2,7 @@ set(LLVM_LINK_COMPONENTS Support)
 
 add_clang_library(clangPseudo
   DirectiveMap.cpp
+  Forest.cpp
   Grammar.cpp
   GrammarBNF.cpp
   Lex.cpp
diff --git a/clang-tools-extra/pseudo/lib/Forest.cpp b/clang-tools-extra/pseudo/lib/Forest.cpp
new file mode 100644 (file)
index 0000000..41b9be8
--- /dev/null
@@ -0,0 +1,126 @@
+//===--- Forest.cpp - Parse forest  ------------------------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang-pseudo/Forest.h"
+#include "clang-pseudo/Token.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/FormatVariadic.h"
+
+namespace clang {
+namespace pseudo {
+
+std::string ForestNode::dump(const Grammar &G) const {
+  switch (kind()) {
+  case Ambiguous:
+    return llvm::formatv("{0} := <ambiguous>", G.symbolName(symbol()));
+  case Terminal:
+    return llvm::formatv("{0} := tok[{1}]", G.symbolName(symbol()),
+                         startTokenIndex());
+  case Sequence:
+    return G.dumpRule(rule());
+  case Opaque:
+    return llvm::formatv("{0} := <opaque>", G.symbolName(symbol()));
+  }
+  llvm_unreachable("Unhandled node kind!");
+}
+
+std::string ForestNode::dumpRecursive(const Grammar &G,
+                                      bool Abbreviated) const {
+  // Count visits of nodes so we can mark those seen multiple times.
+  llvm::DenseMap<const ForestNode *, /*VisitCount*/ unsigned> VisitCounts;
+  std::function<void(const ForestNode *)> CountVisits =
+      [&](const ForestNode *P) {
+        if (VisitCounts[P]++ > 0)
+          return; // Don't count children as multiply visited.
+        if (P->kind() == Ambiguous)
+          llvm::for_each(P->alternatives(), CountVisits);
+        else if (P->kind() == Sequence)
+          llvm::for_each(P->elements(), CountVisits);
+      };
+  CountVisits(this);
+
+  // We print a "#<id>" for nonterminal forest nodes that are being dumped
+  // multiple times.
+  llvm::DenseMap<const ForestNode *, size_t> ReferenceIds;
+  std::string Result;
+  constexpr Token::Index KEnd = std::numeric_limits<Token::Index>::max();
+  std::function<void(const ForestNode *, unsigned, Token::Index,
+                     llvm::Optional<SymbolID>)>
+      Dump = [&](const ForestNode *P, unsigned Level, Token::Index End,
+                 llvm::Optional<SymbolID> ElidedParent) {
+        llvm::ArrayRef<const ForestNode *> Children;
+        auto EndOfElement = [&](size_t ChildIndex) {
+          return ChildIndex + 1 == Children.size()
+                     ? End
+                     : Children[ChildIndex + 1]->startTokenIndex();
+        };
+        if (P->kind() == Ambiguous) {
+          Children = P->alternatives();
+        } else if (P->kind() == Sequence) {
+          Children = P->elements();
+          if (Abbreviated) {
+            if (P->startTokenIndex() == End)
+              return;
+            for (size_t I = 0; I < Children.size(); ++I)
+              if (Children[I]->startTokenIndex() == P->startTokenIndex() &&
+                  EndOfElement(I) == End) {
+                return Dump(
+                    Children[I], Level, End,
+                    /*ElidedParent=*/ElidedParent.getValueOr(P->symbol()));
+              }
+          }
+        }
+
+        // FIXME: pretty ascii trees
+        if (End == KEnd)
+          Result += llvm::formatv("[{0,3}, end) ", P->startTokenIndex());
+        else
+          Result += llvm::formatv("[{0,3}, {1,3}) ", P->startTokenIndex(), End);
+        Result.append(2 * Level, ' ');
+        if (ElidedParent.hasValue()) {
+          Result += G.symbolName(*ElidedParent);
+          Result += "~";
+        }
+        Result.append(P->dump(G));
+
+        if (VisitCounts.find(P)->getSecond() > 1 &&
+            P->kind() != ForestNode::Terminal) {
+          // The first time, print as #1. Later, =#1.
+          auto It = ReferenceIds.try_emplace(P, ReferenceIds.size() + 1);
+          Result +=
+              llvm::formatv(" {0}#{1}", It.second ? "" : "=", It.first->second);
+        }
+        Result.push_back('\n');
+
+        ++Level;
+        for (size_t I = 0; I < Children.size(); ++I)
+          Dump(Children[I], Level,
+               P->kind() == Sequence ? EndOfElement(I) : End, llvm::None);
+      };
+  Dump(this, 0, KEnd, llvm::None);
+  return Result;
+}
+
+llvm::ArrayRef<ForestNode>
+ForestArena::createTerminals(const TokenStream &Code) {
+  ForestNode *Terminals = Arena.Allocate<ForestNode>(Code.tokens().size());
+  size_t Index = 0;
+  for (const auto &T : Code.tokens()) {
+    new (&Terminals[Index])
+        ForestNode(ForestNode::Terminal, tokenSymbol(T.Kind),
+                   /*Start=*/Index, /*TerminalData*/ 0);
+    ++Index;
+  }
+  NodeCount = Index;
+  return llvm::makeArrayRef(Terminals, Index);
+}
+
+} // namespace pseudo
+} // namespace clang
index e4919f8..7b06e5b 100644 (file)
@@ -5,6 +5,7 @@ set(LLVM_LINK_COMPONENTS
 add_custom_target(ClangPseudoUnitTests)
 add_unittest(ClangPseudoUnitTests ClangPseudoTests
   DirectiveMapTest.cpp
+  ForestTest.cpp
   GrammarTest.cpp
   LRTableTest.cpp
   TokenTest.cpp
diff --git a/clang-tools-extra/pseudo/unittests/ForestTest.cpp b/clang-tools-extra/pseudo/unittests/ForestTest.cpp
new file mode 100644 (file)
index 0000000..b9bd08d
--- /dev/null
@@ -0,0 +1,130 @@
+//===--- ForestTest.cpp - Test Forest dump ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang-pseudo/Forest.h"
+#include "clang-pseudo/Token.h"
+#include "clang/Basic/LangOptions.h"
+#include "llvm/ADT/StringRef.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include <vector>
+
+namespace clang {
+namespace pseudo {
+namespace {
+
+// FIXME: extract to a TestGrammar class to allow code sharing among tests.
+class ForestTest : public ::testing::Test {
+public:
+  void build(llvm::StringRef BNF) {
+    Diags.clear();
+    G = Grammar::parseBNF(BNF, Diags);
+  }
+
+  SymbolID symbol(llvm::StringRef Name) const {
+    for (unsigned I = 0; I < NumTerminals; ++I)
+      if (G->table().Terminals[I] == Name)
+        return tokenSymbol(static_cast<tok::TokenKind>(I));
+    for (SymbolID ID = 0; ID < G->table().Nonterminals.size(); ++ID)
+      if (G->table().Nonterminals[ID].Name == Name)
+        return ID;
+    ADD_FAILURE() << "No such symbol found: " << Name;
+    return 0;
+  }
+
+  RuleID ruleFor(llvm::StringRef NonterminalName) const {
+    auto RuleRange = G->table().Nonterminals[symbol(NonterminalName)].RuleRange;
+    if (RuleRange.End - RuleRange.Start == 1)
+      return G->table().Nonterminals[symbol(NonterminalName)].RuleRange.Start;
+    ADD_FAILURE() << "Expected a single rule for " << NonterminalName
+                  << ", but it has " << RuleRange.End - RuleRange.Start
+                  << " rule!\n";
+    return 0;
+  }
+
+protected:
+  std::unique_ptr<Grammar> G;
+  std::vector<std::string> Diags;
+};
+
+TEST_F(ForestTest, DumpBasic) {
+  build(R"cpp(
+    _ := add-expression
+    add-expression := id-expression + id-expression
+    id-expression := IDENTIFIER
+  )cpp");
+  ASSERT_TRUE(Diags.empty());
+  ForestArena Arena;
+  const auto &TS =
+      cook(lex("a + b", clang::LangOptions()), clang::LangOptions());
+
+  auto T = Arena.createTerminals(TS);
+  ASSERT_EQ(T.size(), 3u);
+  const auto *Left = &Arena.createSequence(
+      symbol("id-expression"), ruleFor("id-expression"), {&T.front()});
+  const auto *Right = &Arena.createSequence(symbol("id-expression"),
+                                            ruleFor("id-expression"), {&T[2]});
+
+  const auto *Add =
+      &Arena.createSequence(symbol("add-expression"), ruleFor("add-expression"), {Left, &T[1], Right});
+  EXPECT_EQ(Add->dumpRecursive(*G, true),
+            "[  0, end) add-expression := id-expression + id-expression\n"
+            "[  0,   1)   id-expression~IDENTIFIER := tok[0]\n"
+            "[  1,   2)   + := tok[1]\n"
+            "[  2, end)   id-expression~IDENTIFIER := tok[2]\n");
+  EXPECT_EQ(Add->dumpRecursive(*G, false),
+            "[  0, end) add-expression := id-expression + id-expression\n"
+            "[  0,   1)   id-expression := IDENTIFIER\n"
+            "[  0,   1)     IDENTIFIER := tok[0]\n"
+            "[  1,   2)   + := tok[1]\n"
+            "[  2, end)   id-expression := IDENTIFIER\n"
+            "[  2, end)     IDENTIFIER := tok[2]\n");
+}
+
+TEST_F(ForestTest, DumpAmbiguousAndRefs) {
+  build(R"cpp(
+    _ := type
+    type := class-type # rule 1
+    type := enum-type # rule 2
+    class-type := shared-type
+    enum-type := shared-type
+    shared-type := IDENTIFIER)cpp");
+  ASSERT_TRUE(Diags.empty());
+  ForestArena Arena;
+  const auto &TS = cook(lex("abc", clang::LangOptions()), clang::LangOptions());
+
+  auto Terminals = Arena.createTerminals(TS);
+  ASSERT_EQ(Terminals.size(), 1u);
+
+  const auto *SharedType = &Arena.createSequence(
+      symbol("shared-type"), ruleFor("shared-type"), {Terminals.begin()});
+  const auto *ClassType = &Arena.createSequence(
+      symbol("class-type"), ruleFor("class-type"), {SharedType});
+  const auto *Enumtype = &Arena.createSequence(
+      symbol("enum-type"), ruleFor("enum-type"), {SharedType});
+  const auto *Alternative1 =
+      &Arena.createSequence(symbol("type"), /*RuleID=*/1, {ClassType});
+  const auto *Alternative2 =
+      &Arena.createSequence(symbol("type"), /*RuleID=*/2, {Enumtype});
+  const auto *Type =
+      &Arena.createAmbiguous(symbol("type"), {Alternative1, Alternative2});
+  EXPECT_EQ(Type->dumpRecursive(*G),
+            "[  0, end) type := <ambiguous>\n"
+            "[  0, end)   class-type := shared-type\n"
+            "[  0, end)     class-type := shared-type\n"
+            "[  0, end)       shared-type := IDENTIFIER #1\n"
+            "[  0, end)         IDENTIFIER := tok[0]\n"
+            "[  0, end)   enum-type := shared-type\n"
+            "[  0, end)     enum-type := shared-type\n"
+            "[  0, end)       shared-type := IDENTIFIER =#1\n"
+            "[  0, end)         IDENTIFIER := tok[0]\n");
+}
+
+} // namespace
+} // namespace pseudo
+} // namespace clang