[pseudo] Split greatergreater token.

author Haojian Wu <hokein.wu@gmail.com>

Thu, 17 Mar 2022 12:42:31 +0000 (13:42 +0100)

committer Haojian Wu <hokein.wu@gmail.com>

Thu, 17 Mar 2022 12:46:58 +0000 (13:46 +0100)
author Haojian Wu <hokein.wu@gmail.com>
Thu, 17 Mar 2022 12:42:31 +0000 (13:42 +0100)
committer Haojian Wu <hokein.wu@gmail.com>
Thu, 17 Mar 2022 12:46:58 +0000 (13:46 +0100)
diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/Token.h b/clang-tools-extra/pseudo/include/clang-pseudo/Token.h

index 24b6729..4563477 100644 (file)
--- a/clang-tools-extra/pseudo/include/clang-pseudo/Token.h
+++ b/clang-tools-extra/pseudo/include/clang-pseudo/Token.h
@@ -180,7 +180,8 @@ enum class LexFlags : uint8_t {
    NeedsCleaning = 1 << 1,
  };
  
-/// Derives a token stream by decoding escapes and interpreting raw_identifiers.
+/// Derives a token stream by decoding escapes, interpreting raw_identifiers and
+/// splitting the greatergreater token.
  ///
  /// Tokens containing UCNs, escaped newlines, trigraphs etc are decoded and
  /// their backing data is owned by the returned stream.
diff --git a/clang-tools-extra/pseudo/lib/Lex.cpp b/clang-tools-extra/pseudo/lib/Lex.cpp

index f5a2395..e99bf3a 100644 (file)
--- a/clang-tools-extra/pseudo/lib/Lex.cpp
+++ b/clang-tools-extra/pseudo/lib/Lex.cpp
@@ -98,9 +98,21 @@ TokenStream cook(const TokenStream &Code, const LangOptions &LangOpts) {
        Tok.Length = Text.size();
        Tok.Flags &= ~static_cast<decltype(Tok.Flags)>(LexFlags::NeedsCleaning);
      }
-    // Cook raw_identifiers into identifier, keyword, etc.
-    if (Tok.Kind == tok::raw_identifier)
+
+    if (Tok.Kind == tok::raw_identifier) {
+      // Cook raw_identifiers into identifier, keyword, etc.
        Tok.Kind = Identifiers.get(Tok.text()).getTokenID();
+    } else if (Tok.Kind == tok::greatergreater) {
+      // Split the greatergreater token.
+      // FIXME: split lessless token to support Cuda triple angle brackets <<<.
+      assert(Tok.text() == ">>");
+      Tok.Kind = tok::greater;
+      Tok.Length = 1;
+      Result.push(Tok);
+      // Line is wrong if the first greater is followed by an escaped newline!
+      Tok.Data = Tok.text().data() + 1;
+    }
+
      Result.push(std::move(Tok));
    }
  
diff --git a/clang-tools-extra/pseudo/lib/cxx.bnf b/clang-tools-extra/pseudo/lib/cxx.bnf

index 2619ce1..45b2f61 100644 (file)
--- a/clang-tools-extra/pseudo/lib/cxx.bnf
+++ b/clang-tools-extra/pseudo/lib/cxx.bnf
@@ -13,6 +13,9 @@
  #  - the file merely describes the core C++ grammar. Preprocessor directives and
  #    lexical conversions are omitted as we reuse clang's lexer and run a fake
  #    preprocessor;
+#  - grammar rules with the >> token are adjusted: the greatergreater token is
+#    split into two > tokens so that the GLR parser can handle both nested
+#    templates and the right shift operator;
  #
  # Guidelines:
  #   - non-terminals are lower_case; terminals (aka tokens) correspond to
@@ -96,7 +99,7 @@ fold-operator := %
  fold-operator := ^
  fold-operator := |
  fold-operator := <<
-fold-operator := >>
+fold-operator := greatergreater
  fold-operator := +=
  fold-operator := -=
  fold-operator := *=
@@ -202,7 +205,7 @@ additive-expression := additive-expression - multiplicative-expression
  # expr.shift
  shift-expression := additive-expression
  shift-expression := shift-expression << additive-expression
-shift-expression := shift-expression >> additive-expression
+shift-expression := shift-expression greatergreater additive-expression
  # expr.spaceship
  compare-expression := shift-expression
  compare-expression := compare-expression <=> shift-expression
@@ -615,7 +618,7 @@ operator-name := <=>
  operator-name := ^^
  operator-name := ||
  operator-name := <<
-operator-name := >>
+operator-name := greatergreater
  operator-name := <<=
  operator-name := >>=
  operator-name := ++
@@ -737,3 +740,8 @@ contextual-zero := NUMERIC_CONSTANT
  module-keyword := IDENTIFIER
  import-keyword := IDENTIFIER
  export-keyword := IDENTIFIER
+
+#! greatergreater token -- clang lexer always lexes it as a single token; we
+#! split it into two tokens to make the GLR parser aware of the nested-template
+#! case.
+greatergreater := > >
diff --git a/clang-tools-extra/pseudo/unittests/TokenTest.cpp b/clang-tools-extra/pseudo/unittests/TokenTest.cpp

index 1357d23..b17f8c9 100644 (file)
--- a/clang-tools-extra/pseudo/unittests/TokenTest.cpp
+++ b/clang-tools-extra/pseudo/unittests/TokenTest.cpp
@@ -171,6 +171,25 @@ no_indent \
                              }));
  }
  
+TEST(TokenTest, SplitGreaterGreater) {
+  LangOptions Opts;
+  std::string Code = R"cpp(
+>> // split
+// >> with an escaped newline in the middle, split
+>\
+>
+>>= // not split
+)cpp";
+  TokenStream Split = stripComments(cook(lex(Code, Opts), Opts));
+  EXPECT_THAT(Split.tokens(), ElementsAreArray({
+                                  token(">", tok::greater),
+                                  token(">", tok::greater),
+                                  token(">", tok::greater),
+                                  token(">", tok::greater),
+                                  token(">>=", tok::greatergreaterequal),
+                              }));
+}
+
  TEST(TokenTest, DropComments) {
    LangOptions Opts;
    std::string Code = R"cpp(
author	Haojian Wu <hokein.wu@gmail.com>
	Thu, 17 Mar 2022 12:42:31 +0000 (13:42 +0100)
committer	Haojian Wu <hokein.wu@gmail.com>
	Thu, 17 Mar 2022 12:46:58 +0000 (13:46 +0100)
clang-tools-extra/pseudo/include/clang-pseudo/Token.h		patch \| blob \| history
clang-tools-extra/pseudo/lib/Lex.cpp		patch \| blob \| history
clang-tools-extra/pseudo/lib/cxx.bnf		patch \| blob \| history
clang-tools-extra/pseudo/unittests/TokenTest.cpp		patch \| blob \| history