/// command, including command marker.
SmallString<16> VerbatimBlockEndCommandName;
+ /// If true, commands, HTML tags, etc. are lexed and reported as separate
+ /// tokens inside the comment body. If false, the comment text is lexed only
+ /// into text and newline tokens.
+ bool ParseCommands;
+
/// Given a character reference name (e.g., "lt"), return the character that
/// it stands for (e.g., "<").
StringRef resolveHTMLNamedCharacterReference(StringRef Name) const;
/// Eat string matching regexp \code \s*\* \endcode.
void skipLineStartingDecorations();
- /// Lex stuff inside comments. CommentEnd should be set correctly.
+ /// Lex comment text, including commands if ParseCommands is set to true.
void lexCommentText(Token &T);
- void setupAndLexVerbatimBlock(Token &T,
- const char *TextBegin,
- char Marker, const CommandInfo *Info);
+ void setupAndLexVerbatimBlock(Token &T, const char *TextBegin, char Marker,
+ const CommandInfo *Info);
void lexVerbatimBlockFirstLine(Token &T);
public:
Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
- const CommandTraits &Traits,
- SourceLocation FileLoc,
- const char *BufferStart, const char *BufferEnd);
+ const CommandTraits &Traits, SourceLocation FileLoc,
+ const char *BufferStart, const char *BufferEnd,
+ bool ParseCommands = true);
void lex(Token &T);
- StringRef getSpelling(const Token &Tok,
- const SourceManager &SourceMgr,
+ StringRef getSpelling(const Token &Tok, const SourceManager &SourceMgr,
bool *Invalid = nullptr) const;
};
return extractBriefText(Context);
}
+ /// Returns sanitized comment text, suitable for presentation in editor UIs.
+ /// E.g. will transform:
+ ///     // This is a long multiline comment.
+ ///     //   Parts of it might be indented.
+ ///     /* The comment styles might be mixed. */
+ /// into
+ ///     "This is a long multiline comment.\n"
+ ///     "  Parts of it might be indented.\n"
+ ///     "The comment styles might be mixed."
+ /// Also removes leading indentation and sanitizes some common cases:
+ ///     /* This is a first line.
+ ///      *   This is a second line. It is indented.
+ ///      * This is a third line. */
+ /// and
+ ///     /* This is a first line.
+ ///          This is a second line. It is indented.
+ ///        This is a third line. */
+ /// will both turn into:
+ ///     "This is a first line.\n"
+ ///     "  This is a second line. It is indented.\n"
+ ///     "This is a third line."
+ std::string getFormattedText(const SourceManager &SourceMgr,
+ DiagnosticsEngine &Diags) const;
+
/// Parse the comment, assuming it is attached to decl \c D.
comments::FullComment *parse(const ASTContext &Context,
const Preprocessor *PP, const Decl *D) const;
assert(CommentState == LCS_InsideBCPLComment ||
CommentState == LCS_InsideCComment);
+ // Handles lexing non-command text, i.e. text and newline.
+ auto HandleNonCommandToken = [&]() -> void {
+ assert(State == LS_Normal);
+
+ const char *TokenPtr = BufferPtr;
+ assert(TokenPtr < CommentEnd);
+ switch (*TokenPtr) {
+ case '\n':
+ case '\r':
+ TokenPtr = skipNewline(TokenPtr, CommentEnd);
+ formTokenWithChars(T, TokenPtr, tok::newline);
+
+ if (CommentState == LCS_InsideCComment)
+ skipLineStartingDecorations();
+ return;
+
+ default: {
+ StringRef TokStartSymbols = ParseCommands ? "\n\r\\@&<" : "\n\r";
+ size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr)
+ .find_first_of(TokStartSymbols);
+ if (End != StringRef::npos)
+ TokenPtr += End;
+ else
+ TokenPtr = CommentEnd;
+ formTextToken(T, TokenPtr);
+ return;
+ }
+ }
+ };
+
+ if (!ParseCommands)
+ return HandleNonCommandToken();
+
switch (State) {
case LS_Normal:
break;
}
assert(State == LS_Normal);
-
const char *TokenPtr = BufferPtr;
assert(TokenPtr < CommentEnd);
- while (TokenPtr != CommentEnd) {
- switch(*TokenPtr) {
- case '\\':
- case '@': {
- // Commands that start with a backslash and commands that start with
- // 'at' have equivalent semantics. But we keep information about the
- // exact syntax in AST for comments.
- tok::TokenKind CommandKind =
- (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
+ switch(*TokenPtr) {
+ case '\\':
+ case '@': {
+ // Commands that start with a backslash and commands that start with
+ // 'at' have equivalent semantics. But we keep information about the
+ // exact syntax in AST for comments.
+ tok::TokenKind CommandKind =
+ (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
+ TokenPtr++;
+ if (TokenPtr == CommentEnd) {
+ formTextToken(T, TokenPtr);
+ return;
+ }
+ char C = *TokenPtr;
+ switch (C) {
+ default:
+ break;
+
+ case '\\': case '@': case '&': case '$':
+ case '#': case '<': case '>': case '%':
+ case '\"': case '.': case ':':
+ // This is one of \\ \@ \& \$ etc escape sequences.
TokenPtr++;
- if (TokenPtr == CommentEnd) {
- formTextToken(T, TokenPtr);
- return;
- }
- char C = *TokenPtr;
- switch (C) {
- default:
- break;
-
- case '\\': case '@': case '&': case '$':
- case '#': case '<': case '>': case '%':
- case '\"': case '.': case ':':
- // This is one of \\ \@ \& \$ etc escape sequences.
+ if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
+ // This is the \:: escape sequence.
TokenPtr++;
- if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
- // This is the \:: escape sequence.
- TokenPtr++;
- }
- StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
- formTokenWithChars(T, TokenPtr, tok::text);
- T.setText(UnescapedText);
- return;
}
+ StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
+ formTokenWithChars(T, TokenPtr, tok::text);
+ T.setText(UnescapedText);
+ return;
+ }
- // Don't make zero-length commands.
- if (!isCommandNameStartCharacter(*TokenPtr)) {
- formTextToken(T, TokenPtr);
- return;
- }
+ // Don't make zero-length commands.
+ if (!isCommandNameStartCharacter(*TokenPtr)) {
+ formTextToken(T, TokenPtr);
+ return;
+ }
- TokenPtr = skipCommandName(TokenPtr, CommentEnd);
- unsigned Length = TokenPtr - (BufferPtr + 1);
-
- // Hardcoded support for lexing LaTeX formula commands
- // \f$ \f[ \f] \f{ \f} as a single command.
- if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
- C = *TokenPtr;
- if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
- TokenPtr++;
- Length++;
- }
- }
+ TokenPtr = skipCommandName(TokenPtr, CommentEnd);
+ unsigned Length = TokenPtr - (BufferPtr + 1);
- StringRef CommandName(BufferPtr + 1, Length);
-
- const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
- if (!Info) {
- if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
- StringRef CorrectedName = Info->Name;
- SourceLocation Loc = getSourceLocation(BufferPtr);
- SourceLocation EndLoc = getSourceLocation(TokenPtr);
- SourceRange FullRange = SourceRange(Loc, EndLoc);
- SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc);
- Diag(Loc, diag::warn_correct_comment_command_name)
- << FullRange << CommandName << CorrectedName
- << FixItHint::CreateReplacement(CommandRange, CorrectedName);
- } else {
- formTokenWithChars(T, TokenPtr, tok::unknown_command);
- T.setUnknownCommandName(CommandName);
- Diag(T.getLocation(), diag::warn_unknown_comment_command_name)
- << SourceRange(T.getLocation(), T.getEndLocation());
- return;
- }
- }
- if (Info->IsVerbatimBlockCommand) {
- setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
- return;
- }
- if (Info->IsVerbatimLineCommand) {
- setupAndLexVerbatimLine(T, TokenPtr, Info);
- return;
+ // Hardcoded support for lexing LaTeX formula commands
+ // \f$ \f[ \f] \f{ \f} as a single command.
+ if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
+ C = *TokenPtr;
+ if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
+ TokenPtr++;
+ Length++;
}
- formTokenWithChars(T, TokenPtr, CommandKind);
- T.setCommandID(Info->getID());
- return;
}
- case '&':
- lexHTMLCharacterReference(T);
- return;
-
- case '<': {
- TokenPtr++;
- if (TokenPtr == CommentEnd) {
- formTextToken(T, TokenPtr);
+ StringRef CommandName(BufferPtr + 1, Length);
+
+ const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
+ if (!Info) {
+ if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
+ StringRef CorrectedName = Info->Name;
+ SourceLocation Loc = getSourceLocation(BufferPtr);
+ SourceLocation EndLoc = getSourceLocation(TokenPtr);
+ SourceRange FullRange = SourceRange(Loc, EndLoc);
+ SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc);
+ Diag(Loc, diag::warn_correct_comment_command_name)
+ << FullRange << CommandName << CorrectedName
+ << FixItHint::CreateReplacement(CommandRange, CorrectedName);
+ } else {
+ formTokenWithChars(T, TokenPtr, tok::unknown_command);
+ T.setUnknownCommandName(CommandName);
+ Diag(T.getLocation(), diag::warn_unknown_comment_command_name)
+ << SourceRange(T.getLocation(), T.getEndLocation());
return;
}
- const char C = *TokenPtr;
- if (isHTMLIdentifierStartingCharacter(C))
- setupAndLexHTMLStartTag(T);
- else if (C == '/')
- setupAndLexHTMLEndTag(T);
- else
- formTextToken(T, TokenPtr);
+ }
+ if (Info->IsVerbatimBlockCommand) {
+ setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
return;
}
-
- case '\n':
- case '\r':
- TokenPtr = skipNewline(TokenPtr, CommentEnd);
- formTokenWithChars(T, TokenPtr, tok::newline);
-
- if (CommentState == LCS_InsideCComment)
- skipLineStartingDecorations();
+ if (Info->IsVerbatimLineCommand) {
+ setupAndLexVerbatimLine(T, TokenPtr, Info);
return;
+ }
+ formTokenWithChars(T, TokenPtr, CommandKind);
+ T.setCommandID(Info->getID());
+ return;
+ }
- default: {
- size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr).
- find_first_of("\n\r\\@&<");
- if (End != StringRef::npos)
- TokenPtr += End;
- else
- TokenPtr = CommentEnd;
+ case '&':
+ lexHTMLCharacterReference(T);
+ return;
+
+ case '<': {
+ TokenPtr++;
+ if (TokenPtr == CommentEnd) {
formTextToken(T, TokenPtr);
return;
}
+ const char C = *TokenPtr;
+ if (isHTMLIdentifierStartingCharacter(C))
+ setupAndLexHTMLStartTag(T);
+ else if (C == '/')
+ setupAndLexHTMLEndTag(T);
+ else
+ formTextToken(T, TokenPtr);
+ return;
}
+
+ default:
+ return HandleNonCommandToken();
}
}
}
Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
- const CommandTraits &Traits,
- SourceLocation FileLoc,
- const char *BufferStart, const char *BufferEnd):
- Allocator(Allocator), Diags(Diags), Traits(Traits),
- BufferStart(BufferStart), BufferEnd(BufferEnd),
- FileLoc(FileLoc), BufferPtr(BufferStart),
- CommentState(LCS_BeforeComment), State(LS_Normal) {
-}
+ const CommandTraits &Traits, SourceLocation FileLoc,
+ const char *BufferStart, const char *BufferEnd,
+ bool ParseCommands)
+ : Allocator(Allocator), Diags(Diags), Traits(Traits),
+ BufferStart(BufferStart), BufferEnd(BufferEnd), FileLoc(FileLoc),
+ BufferPtr(BufferStart), CommentState(LCS_BeforeComment), State(LS_Normal),
+ ParseCommands(ParseCommands) {}
void Lexer::lex(Token &T) {
again:
BeforeThanCompare<RawComment>(SourceMgr));
std::swap(Comments, MergedComments);
}
+
+/// Lexes the raw text of this comment with command parsing disabled and
+/// rebuilds it as plain text: the spelling of every text token is appended
+/// to the result and every newline token becomes a single '\n'.
+///
+/// Leading whitespace of the first line is dropped entirely. For every
+/// following line, leading whitespace is dropped only up to the column at
+/// which the first line's text started, so indentation relative to the first
+/// line is kept. Trailing newlines are removed from the result.
+///
+/// \param SourceMgr used to obtain the raw text, token spellings and token
+///        columns.
+/// \param Diags diagnostics engine handed to the comment lexer.
+/// \returns the formatted text; empty if the raw comment text is empty or
+///          the lexer produces no text.
+std::string RawComment::getFormattedText(const SourceManager &SourceMgr,
+                                         DiagnosticsEngine &Diags) const {
+  llvm::StringRef CommentText = getRawText(SourceMgr);
+  if (CommentText.empty())
+    return "";
+
+  llvm::BumpPtrAllocator Allocator;
+  // We do not parse any commands, so CommentOptions are ignored by
+  // comments::Lexer. Therefore, we just use default-constructed options.
+  CommentOptions DefOpts;
+  comments::CommandTraits EmptyTraits(Allocator, DefOpts);
+  comments::Lexer L(Allocator, Diags, EmptyTraits, getSourceRange().getBegin(),
+                    CommentText.begin(), CommentText.end(),
+                    /*ParseCommands=*/false);
+
+  std::string Result;
+  // The column number of the first non-whitespace character in the comment
+  // text. We skip whitespace up to this column, but keep the whitespace after
+  // this column. IndentColumn is calculated when lexing the first line and
+  // reused for the rest of the lines.
+  unsigned IndentColumn = 0;
+
+  // Processes one line of the comment and adds it to the result.
+  // Handles skipping the indent at the start of the line.
+  // Returns false when eof is reached and true otherwise.
+  auto LexLine = [&](bool IsFirstLine) -> bool {
+    comments::Token Tok;
+    // Lex the first token on the line. We handle it separately, because we
+    // need to fix up its indentation.
+    L.lex(Tok);
+    if (Tok.is(comments::tok::eof))
+      return false;
+    if (Tok.is(comments::tok::newline)) {
+      Result += "\n";
+      return true;
+    }
+    llvm::StringRef TokText = L.getSpelling(Tok, SourceMgr);
+    bool LocInvalid = false;
+    unsigned TokColumn =
+        SourceMgr.getSpellingColumnNumber(Tok.getLocation(), &LocInvalid);
+    assert(!LocInvalid && "getFormattedText for invalid location");
+
+    // Amount of leading whitespace in TokText.
+    size_t WhitespaceLen = TokText.find_first_not_of(" \t");
+    if (WhitespaceLen == StringRef::npos)
+      WhitespaceLen = TokText.size();
+    // Remember the amount of whitespace we skipped in the first line to remove
+    // indent up to that column in the following lines.
+    if (IsFirstLine)
+      IndentColumn = TokColumn + WhitespaceLen;
+
+    // Amount of leading whitespace we actually want to skip.
+    // For the first line we skip all the whitespace.
+    // For the rest of the lines, we skip whitespace up to IndentColumn.
+    size_t SkipLen = WhitespaceLen;
+    if (!IsFirstLine) {
+      // Do the subtraction in signed arithmetic on purpose: the token may
+      // start to the right of IndentColumn, in which case nothing is skipped.
+      const int ColumnsToSkip =
+          static_cast<int>(IndentColumn) - static_cast<int>(TokColumn);
+      SkipLen = std::min<size_t>(WhitespaceLen, std::max(ColumnsToSkip, 0));
+    }
+    llvm::StringRef Trimmed = TokText.drop_front(SkipLen);
+    Result += Trimmed;
+    // Lex all tokens in the rest of the line.
+    for (L.lex(Tok); Tok.isNot(comments::tok::eof); L.lex(Tok)) {
+      if (Tok.is(comments::tok::newline)) {
+        Result += "\n";
+        return true;
+      }
+      Result += L.getSpelling(Tok, SourceMgr);
+    }
+    // We've reached the end of file token.
+    return false;
+  };
+
+  // Removes every trailing '\n'. The string may be empty here (the lexer can
+  // return eof before any text was appended), and std::string::back() must
+  // not be called on an empty string, so check emptiness first.
+  auto DropTrailingNewLines = [](std::string &Str) {
+    while (!Str.empty() && Str.back() == '\n')
+      Str.pop_back();
+  };
+
+  // Process the first line separately to remember the indent for the
+  // following lines, then process the rest of the lines until eof.
+  if (LexLine(/*IsFirstLine=*/true)) {
+    while (LexLine(/*IsFirstLine=*/false))
+      ;
+  }
+  DropTrailingNewLines(Result);
+  return Result;
+}
ASTVectorTest.cpp
CommentLexer.cpp
CommentParser.cpp
+ CommentTextTest.cpp
DataCollectionTest.cpp
DeclPrinterTest.cpp
DeclTest.cpp
--- /dev/null
+//===- unittest/AST/CommentTextTest.cpp - Comment text extraction test ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Tests for user-friendly output formatting of comments, i.e.
+// RawComment::getFormattedText().
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/AST/RawCommentList.h"
+#include "clang/Basic/CommentOptions.h"
+#include "clang/Basic/Diagnostic.h"
+#include "clang/Basic/DiagnosticIDs.h"
+#include "clang/Basic/FileManager.h"
+#include "clang/Basic/FileSystemOptions.h"
+#include "clang/Basic/SourceLocation.h"
+#include "clang/Basic/SourceManager.h"
+#include "clang/Basic/VirtualFileSystem.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include <gtest/gtest.h>
+
+namespace clang {
+
+/// Fixture for RawComment::getFormattedText() tests. The comment under test
+/// lives in an in-memory file, so no files are read from disk.
+class CommentTextTest : public ::testing::Test {
+protected:
+  /// Treats everything from the first '/' in \p CommentText up to the end of
+  /// the buffer as a single raw comment and returns its formatted text.
+  std::string formatComment(llvm::StringRef CommentText) {
+    SourceManagerForFile FileSourceMgr("comment-test.cpp", CommentText);
+    SourceManager &SM = FileSourceMgr.get();
+
+    // The comment starts at the first slash; anything before it is skipped.
+    const auto StartOffset = CommentText.find("/");
+    assert(StartOffset != llvm::StringRef::npos);
+
+    const FileID MainFile = SM.getMainFileID();
+    const SourceLocation CommentBegin =
+        SM.getLocForStartOfFile(MainFile).getLocWithOffset(StartOffset);
+    const SourceLocation CommentEnd = SM.getLocForEndOfFile(MainFile);
+
+    CommentOptions EmptyOpts;
+    // FIXME: technically, the Merged flag we set here is incorrect, but that
+    // shouldn't matter.
+    RawComment Comment(SM, SourceRange(CommentBegin, CommentEnd), EmptyOpts,
+                       /*Merged=*/true);
+    DiagnosticsEngine Diags(new DiagnosticIDs, new DiagnosticOptions);
+    return Comment.getFormattedText(SM, Diags);
+  }
+};
+
+TEST_F(CommentTextTest, FormattedText) {
+ // clang-format off
+ auto ExpectedOutput =
+R"(This function does this and that.
+For example,
+ Runnning it in that case will give you
+ this result.
+That's about it.)";
+ // Two-slash comments.
+ EXPECT_EQ(ExpectedOutput, formatComment(
+R"cpp(
+// This function does this and that.
+// For example,
+// Runnning it in that case will give you
+// this result.
+// That's about it.)cpp"));
+
+ // Three-slash comments.
+ EXPECT_EQ(ExpectedOutput, formatComment(
+R"cpp(
+/// This function does this and that.
+/// For example,
+/// Runnning it in that case will give you
+/// this result.
+/// That's about it.)cpp"));
+
+ // Block comments.
+ EXPECT_EQ(ExpectedOutput, formatComment(
+R"cpp(
+/* This function does this and that.
+ * For example,
+ * Runnning it in that case will give you
+ * this result.
+ * That's about it.*/)cpp"));
+
+ // Doxygen-style block comments.
+ EXPECT_EQ(ExpectedOutput, formatComment(
+R"cpp(
+/** This function does this and that.
+ * For example,
+ * Runnning it in that case will give you
+ * this result.
+ * That's about it.*/)cpp"));
+
+ // Weird indentation.
+ EXPECT_EQ(ExpectedOutput, formatComment(
+R"cpp(
+ // This function does this and that.
+ // For example,
+ // Runnning it in that case will give you
+ // this result.
+ // That's about it.)cpp"));
+ // clang-format on
+}
+
+TEST_F(CommentTextTest, KeepsDoxygenControlSeqs) {
+ // clang-format off
+ auto ExpectedOutput =
+R"(\brief This is the brief part of the comment.
+\param a something about a.
+@param b something about b.)";
+
+ EXPECT_EQ(ExpectedOutput, formatComment(
+R"cpp(
+/// \brief This is the brief part of the comment.
+/// \param a something about a.
+/// @param b something about b.)cpp"));
+ // clang-format on
+}
+
+} // namespace clang