[Symbolize] Parser for log symbolizer markup.

author Daniel Thornburgh <dthorn@google.com>

Thu, 7 Apr 2022 23:37:11 +0000 (23:37 +0000)

committer Daniel Thornburgh <dthorn@google.com>

Fri, 17 Jun 2022 17:26:24 +0000 (10:26 -0700)
author Daniel Thornburgh <dthorn@google.com>
Thu, 7 Apr 2022 23:37:11 +0000 (23:37 +0000)
committer Daniel Thornburgh <dthorn@google.com>
Fri, 17 Jun 2022 17:26:24 +0000 (10:26 -0700)
diff --git a/llvm/include/llvm/DebugInfo/Symbolize/Markup.h b/llvm/include/llvm/DebugInfo/Symbolize/Markup.h

new file mode 100644 (file)

index 0000000..19cc0ab
--- /dev/null
+++ b/llvm/include/llvm/DebugInfo/Symbolize/Markup.h
@@ -0,0 +1,99 @@
+//===- Markup.h -------------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file declares the log symbolizer markup data model and parser.
+///
+/// \todo Add a link to the reference documentation once added.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_SYMBOLIZE_MARKUP_H
+#define LLVM_DEBUGINFO_SYMBOLIZE_MARKUP_H
+
+#include <iostream>
+
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Regex.h"
+
+namespace llvm {
+namespace symbolize {
+
+/// One unit of a parsed log line: either a markup element or plain text.
+///
+/// A node with only Text populated stands for text found outside of any
+/// markup element. ANSI SGR control codes are delivered the same way: a
+/// recognized control code occupies the whole Text of its own node, and the
+/// text on either side of it is delivered as separate nodes before and after.
+///
+/// All string members are non-owning StringRef views.
+struct MarkupNode {
+  /// Exact input text covered by this node.
+  StringRef Text;
+
+  /// Tag of the element; empty when the node is not an element.
+  StringRef Tag;
+
+  /// Contents of the element's fields, in order; empty when the node is not an
+  /// element or the element has no fields.
+  SmallVector<StringRef> Fields;
+
+  /// Two nodes are equal when their text, tag, and fields all compare equal.
+  bool operator==(const MarkupNode &Other) const {
+    if (Text != Other.Text)
+      return false;
+    if (Tag != Other.Tag)
+      return false;
+    return Fields == Other.Fields;
+  }
+  bool operator!=(const MarkupNode &Other) const {
+    return !operator==(Other);
+  }
+};
+
+/// Parses a log containing symbolizer markup into a sequence of nodes.
+class MarkupParser {
+public:
+  MarkupParser();
+
+  /// Parses an individual \p Line of input.
+  ///
+  /// Nodes from the previous parseLine() call that haven't yet been extracted
+  /// by nextNode() are discarded. The nodes returned by nextNode() may
+  /// reference the input string, so it must be retained by the caller until the
+  /// last use.
+  void parseLine(StringRef Line);
+
+  /// Returns the next node from the most recent parseLine() call.
+  ///
+  /// Calling nextNode() may invalidate the contents of the node returned by the
+  /// previous call.
+  ///
+  /// \returns the next markup node or None if none remain.
+  Optional<MarkupNode> nextNode() {
+    if (!NextIdx)
+      NextIdx = 0;
+    if (*NextIdx == Buffer.size()) {
+      NextIdx.reset();
+      Buffer.clear();
+      return None;
+    }
+    return std::move(Buffer[(*NextIdx)++]);
+  }
+
+private:
+  Optional<MarkupNode> parseElement(StringRef Line);
+  void parseTextOutsideMarkup(StringRef Text);
+
+  // Buffer for nodes parsed from the current line.
+  SmallVector<MarkupNode> Buffer;
+
+  // Next buffer index to return or None if nextNode has not yet been called.
+  Optional<size_t> NextIdx;
+
+  // Regular expression matching supported ANSI SGR escape sequences.
+  const Regex SGRSyntax;
+};
+
+} // end namespace symbolize
+} // end namespace llvm
+
+#endif // LLVM_DEBUGINFO_SYMBOLIZE_MARKUP_H
diff --git a/llvm/lib/DebugInfo/Symbolize/CMakeLists.txt b/llvm/lib/DebugInfo/Symbolize/CMakeLists.txt

index a647a16..c83d957 100644 (file)
--- a/llvm/lib/DebugInfo/Symbolize/CMakeLists.txt
+++ b/llvm/lib/DebugInfo/Symbolize/CMakeLists.txt
@@ -1,6 +1,7 @@
  add_llvm_component_library(LLVMSymbolize
    DIFetcher.cpp
    DIPrinter.cpp
+  Markup.cpp
    SymbolizableObjectFile.cpp
    Symbolize.cpp
  
diff --git a/llvm/lib/DebugInfo/Symbolize/Markup.cpp b/llvm/lib/DebugInfo/Symbolize/Markup.cpp

new file mode 100644 (file)

index 0000000..04cf7b3
--- /dev/null
+++ b/llvm/lib/DebugInfo/Symbolize/Markup.cpp
@@ -0,0 +1,111 @@
+//===- lib/DebugInfo/Symbolize/Markup.cpp ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines the log symbolizer markup data model and parser.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/Symbolize/Markup.h"
+
+#include "llvm/ADT/StringExtras.h"
+
+namespace llvm {
+namespace symbolize {
+
+// POSIX extended regular expression for the ANSI SGR ("Select Graphic
+// Rendition") escape sequences the parser recognizes. Matches the following:
+//   "\033[0m"
+//   "\033[1m"
+//   "\033[30m" -- "\033[37m"
+// Per ECMA-48, 0 resets all attributes, 1 selects bold/increased intensity,
+// and 30-37 select the foreground color. Any other SGR sequence is not
+// matched and therefore stays part of the surrounding text.
+static const char SGRSyntaxStr[] = "\033\\[([0-1]|3[0-7])m";
+
+// Builds the SGR regex once per parser so that it can be reused for every
+// region of text examined by parseTextOutsideMarkup().
+MarkupParser::MarkupParser() : SGRSyntax(SGRSyntaxStr) {}
+
+// Returns the prefix of Str that ends just before Pos. Pos must be an iterator
+// into Str.
+static StringRef takeTo(StringRef Str, StringRef::iterator Pos) {
+  const size_t PrefixLen = Pos - Str.begin();
+  return Str.substr(0, PrefixLen);
+}
+// Shrinks Str in place so that it begins at Pos. Pos must be an iterator into
+// Str.
+static void advanceTo(StringRef &Str, StringRef::iterator Pos) {
+  const size_t Consumed = Pos - Str.begin();
+  Str = Str.drop_front(Consumed);
+}
+
+// Splits Line into plain text, SGR control code, and markup element nodes, in
+// input order, and stores them in Buffer for retrieval through nextNode().
+// Nodes left over from a previous line are discarded.
+void MarkupParser::parseLine(StringRef Line) {
+  // Drop the previous line's nodes together with the read position into them.
+  // If NextIdx were left untouched, a caller that had only partially drained
+  // the previous line would make nextNode() resume at a stale index, skipping
+  // nodes of the new line or indexing past the end of Buffer.
+  Buffer.clear();
+  NextIdx.reset();
+  while (!Line.empty()) {
+    // Find the first valid markup element, if any.
+    Optional<MarkupNode> Element = parseElement(Line);
+    if (!Element) {
+      // The line doesn't contain any more markup elements, so emit it as text.
+      parseTextOutsideMarkup(Line);
+      return;
+    }
+
+    // Remember the element's extent before the node is moved into Buffer so
+    // that the moved-from node is never read.
+    const StringRef ElementText = Element->Text;
+    // Everything in front of the element is text (possibly with SGR codes).
+    parseTextOutsideMarkup(takeTo(Line, ElementText.begin()));
+    Buffer.push_back(std::move(*Element));
+    // Continue scanning right after the element.
+    advanceTo(Line, ElementText.end());
+  }
+}
+
+// Finds and returns the next valid markup element in the given line. Returns
+// None if the line contains no valid elements.
+Optional<MarkupNode> MarkupParser::parseElement(StringRef Line) {
+  while (true) {
+    // Find next element using begin and end markers.
+    size_t BeginPos = Line.find("{{{");
+    if (BeginPos == StringRef::npos)
+      return None;
+    size_t EndPos = Line.find("}}}", BeginPos + 3);
+    if (EndPos == StringRef::npos)
+      return None;
+    EndPos += 3;
+    MarkupNode Element;
+    Element.Text = Line.slice(BeginPos, EndPos);
+    Line = Line.substr(EndPos);
+
+    // Parse tag.
+    StringRef Content = Element.Text.drop_front(3).drop_back(3);
+    StringRef FieldsContent;
+    std::tie(Element.Tag, FieldsContent) = Content.split(':');
+    if (Element.Tag.empty())
+      continue;
+
+    // Parse fields.
+    if (!FieldsContent.empty())
+      FieldsContent.split(Element.Fields, ":");
+    else if (Content.back() == ':')
+      Element.Fields.push_back(FieldsContent);
+
+    return Element;
+  }
+}
+
+// Builds a node that carries only Text, leaving the tag and fields empty.
+static MarkupNode textNode(StringRef Text) {
+  return MarkupNode{Text, /*Tag=*/StringRef(), /*Fields=*/{}};
+}
+
+// Handles a region of text known to lie outside any markup element. The region
+// can still contain SGR control codes, so it is cut into alternating plain text
+// and control code pieces, each appended to Buffer as a text-only node.
+void MarkupParser::parseTextOutsideMarkup(StringRef Text) {
+  if (Text.empty())
+    return;
+  SmallVector<StringRef> Groups;
+  while (SGRSyntax.match(Text, &Groups)) {
+    // The first entry is the full text of the matched control code.
+    const StringRef Code = Groups.front();
+
+    // Emit whatever precedes the control code, unless there is nothing.
+    const StringRef Leading = takeTo(Text, Code.begin());
+    if (!Leading.empty())
+      Buffer.push_back(textNode(Leading));
+
+    Buffer.push_back(textNode(Code));
+    advanceTo(Text, Code.end());
+  }
+  // Whatever follows the last control code is plain text.
+  if (!Text.empty())
+    Buffer.push_back(textNode(Text));
+}
+
+} // end namespace symbolize
+} // end namespace llvm
diff --git a/llvm/unittests/DebugInfo/CMakeLists.txt b/llvm/unittests/DebugInfo/CMakeLists.txt

index 0a0a114..4be8d76 100644 (file)
--- a/llvm/unittests/DebugInfo/CMakeLists.txt
+++ b/llvm/unittests/DebugInfo/CMakeLists.txt
@@ -3,3 +3,4 @@ add_subdirectory(DWARF)
  add_subdirectory(GSYM)
  add_subdirectory(MSF)
  add_subdirectory(PDB)
+add_subdirectory(Symbolizer)
diff --git a/llvm/unittests/DebugInfo/Symbolizer/CMakeLists.txt b/llvm/unittests/DebugInfo/Symbolizer/CMakeLists.txt

new file mode 100644 (file)

index 0000000..e6c2ba1
--- /dev/null
+++ b/llvm/unittests/DebugInfo/Symbolizer/CMakeLists.txt
@@ -0,0 +1,3 @@
+# Unit tests for the symbolizer markup parser; links against the Symbolize
+# component under test.
+set(LLVM_LINK_COMPONENTS Symbolize)
+add_llvm_unittest(DebugInfoSymbolizerTests MarkupTest.cpp)
+target_link_libraries(DebugInfoSymbolizerTests PRIVATE LLVMTestingSupport)
diff --git a/llvm/unittests/DebugInfo/Symbolizer/MarkupTest.cpp b/llvm/unittests/DebugInfo/Symbolizer/MarkupTest.cpp

new file mode 100644 (file)

index 0000000..6d587d9
--- /dev/null
+++ b/llvm/unittests/DebugInfo/Symbolizer/MarkupTest.cpp
@@ -0,0 +1,148 @@
+
+//===- unittest/DebugInfo/Symbolizer/MarkupTest.cpp - Markup parser tests -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/Symbolize/Markup.h"
+
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Support/FormatVariadic.h"
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace {
+
+using namespace llvm;
+using namespace llvm::symbolize;
+using namespace testing;
+
+// Builds a matcher that accepts a MarkupNode whose Text and Tag equal the given
+// values and whose Fields satisfy the given matcher. With the defaults, it
+// matches a text-only node: empty tag and no fields.
+Matcher<MarkupNode> isNode(StringRef Text, StringRef Tag = "",
+                           Matcher<SmallVector<StringRef>> Fields = IsEmpty()) {
+  return AllOf(Field("Text", &MarkupNode::Text, Text),
+               Field("Tag", &MarkupNode::Tag, Tag),
+               Field("Fields", &MarkupNode::Fields, Fields));
+}
+
+// A parser that has not been given any line yields no nodes.
+TEST(SymbolizerMarkup, NoLines) { EXPECT_EQ(MarkupParser{}.nextNode(), None); }
+
+// Lines that contain no valid markup element and no supported SGR control code
+// must come back as a single text node covering the whole line.
+TEST(SymbolizerMarkup, LinesWithoutMarkup) {
+  MarkupParser Parser;
+
+  // Plain text.
+  Parser.parseLine("text");
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("text")));
+  EXPECT_THAT(Parser.nextNode(), None);
+
+  // Nodes of a line that was never read are dropped by the next parseLine().
+  Parser.parseLine("discarded");
+  Parser.parseLine("kept");
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("kept")));
+  EXPECT_THAT(Parser.nextNode(), None);
+
+  // Missing or incomplete begin/end markers.
+  Parser.parseLine("{{{");
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("{{{")));
+  EXPECT_THAT(Parser.nextNode(), None);
+
+  Parser.parseLine("{{{}}");
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("{{{}}")));
+  EXPECT_THAT(Parser.nextNode(), None);
+
+  Parser.parseLine("{{}}}");
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("{{}}}")));
+  EXPECT_THAT(Parser.nextNode(), None);
+
+  // Well-delimited spans with an empty tag are not elements.
+  Parser.parseLine("{{{}}}");
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("{{{}}}")));
+  EXPECT_THAT(Parser.nextNode(), None);
+
+  Parser.parseLine("{{{:field}}}");
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("{{{:field}}}")));
+  EXPECT_THAT(Parser.nextNode(), None);
+
+  // A tag is present, but the end marker is missing or incomplete.
+  Parser.parseLine("{{{tag:");
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("{{{tag:")));
+  EXPECT_THAT(Parser.nextNode(), None);
+
+  Parser.parseLine("{{{tag:field}}");
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("{{{tag:field}}")));
+  EXPECT_THAT(Parser.nextNode(), None);
+
+  // SGR sequences outside the supported set (0, 1, 30-37) stay ordinary text.
+  Parser.parseLine("a\033[2mb");
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("a\033[2mb")));
+  EXPECT_THAT(Parser.nextNode(), None);
+
+  Parser.parseLine("a\033[38mb");
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("a\033[38mb")));
+  EXPECT_THAT(Parser.nextNode(), None);
+
+  Parser.parseLine("a\033[4mb");
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("a\033[4mb")));
+  EXPECT_THAT(Parser.nextNode(), None);
+}
+
+// Lines containing valid markup elements and/or supported SGR control codes
+// must be split into the expected sequence of nodes.
+TEST(SymbolizerMarkup, LinesWithMarkup) {
+  MarkupParser Parser;
+
+  // Element with a tag only.
+  Parser.parseLine("{{{tag}}}");
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("{{{tag}}}", "tag")));
+  EXPECT_THAT(Parser.nextNode(), None);
+
+  // Element with several fields.
+  Parser.parseLine("{{{tag:f1:f2:f3}}}");
+  EXPECT_THAT(Parser.nextNode(),
+              testing::Optional(isNode("{{{tag:f1:f2:f3}}}", "tag",
+                                       ElementsAre("f1", "f2", "f3"))));
+  EXPECT_THAT(Parser.nextNode(), None);
+
+  // A trailing ':' yields a single empty field.
+  Parser.parseLine("{{{tag:}}}");
+  EXPECT_THAT(Parser.nextNode(),
+              testing::Optional(isNode("{{{tag:}}}", "tag", ElementsAre(""))));
+  EXPECT_THAT(Parser.nextNode(), None);
+
+  // An incomplete end marker turns the whole line into text.
+  Parser.parseLine("{{{tag:}}");
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("{{{tag:}}")));
+  EXPECT_THAT(Parser.nextNode(), None);
+
+  // Tags may contain digits and upper-case letters.
+  Parser.parseLine("{{{t2g}}}");
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("{{{t2g}}}", "t2g")));
+  EXPECT_THAT(Parser.nextNode(), None);
+
+  Parser.parseLine("{{{tAg}}}");
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("{{{tAg}}}", "tAg")));
+  EXPECT_THAT(Parser.nextNode(), None);
+
+  // Text and elements interleaved on one line.
+  Parser.parseLine("a{{{b}}}c{{{d}}}e");
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("a")));
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("{{{b}}}", "b")));
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("c")));
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("{{{d}}}", "d")));
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("e")));
+  EXPECT_THAT(Parser.nextNode(), None);
+
+  // An invalid (empty-tag) span in front of a valid element is emitted as text.
+  Parser.parseLine("{{{}}}{{{tag}}}");
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("{{{}}}")));
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("{{{tag}}}", "tag")));
+  EXPECT_THAT(Parser.nextNode(), None);
+
+  // Each supported SGR control code becomes its own text node.
+  Parser.parseLine("\033[0mA\033[1mB\033[30mC\033[37m");
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("\033[0m")));
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("A")));
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("\033[1m")));
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("B")));
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("\033[30m")));
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("C")));
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("\033[37m")));
+  EXPECT_THAT(Parser.nextNode(), None);
+
+  // SGR control codes inside an element are left untouched in its fields.
+  Parser.parseLine("{{{tag:\033[0m}}}");
+  EXPECT_THAT(Parser.nextNode(),
+              testing::Optional(
+                  isNode("{{{tag:\033[0m}}}", "tag", ElementsAre("\033[0m"))));
+  EXPECT_THAT(Parser.nextNode(), None);
+}
+
+} // namespace
author	Daniel Thornburgh <dthorn@google.com>
	Thu, 7 Apr 2022 23:37:11 +0000 (23:37 +0000)
committer	Daniel Thornburgh <dthorn@google.com>
	Fri, 17 Jun 2022 17:26:24 +0000 (10:26 -0700)
llvm/include/llvm/DebugInfo/Symbolize/Markup.h	[new file with mode: 0644]	patch \| blob
llvm/lib/DebugInfo/Symbolize/CMakeLists.txt		patch \| blob \| history
llvm/lib/DebugInfo/Symbolize/Markup.cpp	[new file with mode: 0644]	patch \| blob
llvm/unittests/DebugInfo/CMakeLists.txt		patch \| blob \| history
llvm/unittests/DebugInfo/Symbolizer/CMakeLists.txt	[new file with mode: 0644]	patch \| blob
llvm/unittests/DebugInfo/Symbolizer/MarkupTest.cpp	[new file with mode: 0644]	patch \| blob