[clangd] Fix invalid UTF8 when extracting doc comments.
author: Sam McCall <sam.mccall@gmail.com>
Wed, 30 Sep 2020 13:45:13 +0000 (15:45 +0200)
committer: Sam McCall <sam.mccall@gmail.com>
Wed, 30 Sep 2020 14:05:12 +0000 (16:05 +0200)
Differential Revision: https://reviews.llvm.org/D88567

clang-tools-extra/clangd/CodeCompletionStrings.cpp
clang-tools-extra/clangd/unittests/CodeCompletionStringsTests.cpp
clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp

index ef44c15..d4a3bda 100644 (file)
@@ -12,6 +12,7 @@
 #include "clang/AST/RawCommentList.h"
 #include "clang/Basic/SourceManager.h"
 #include "clang/Sema/CodeCompleteConsumer.h"
+#include "llvm/Support/JSON.h"
 #include <limits>
 #include <utility>
 
@@ -86,7 +87,12 @@ std::string getDeclComment(const ASTContext &Ctx, const NamedDecl &Decl) {
   assert(!Ctx.getSourceManager().isLoadedSourceLocation(RC->getBeginLoc()));
   std::string Doc =
       RC->getFormattedText(Ctx.getSourceManager(), Ctx.getDiagnostics());
-  return looksLikeDocComment(Doc) ? Doc : "";
+  if (!looksLikeDocComment(Doc))
+    return "";
+  // Clang requires source to be UTF-8, but doesn't enforce this in comments.
+  if (!llvm::json::isUTF8(Doc))
+    Doc = llvm::json::fixUTF8(Doc);
+  return Doc;
 }
 
 void getSignature(const CodeCompletionString &CCS, std::string *Signature,
index 2531922..7aace93 100644 (file)
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "CodeCompletionStrings.h"
+#include "TestTU.h"
 #include "clang/Sema/CodeCompleteConsumer.h"
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
@@ -56,6 +57,14 @@ TEST_F(CompletionStringTest, DocumentationWithAnnotation) {
             "Annotation: Ano\n\nIs this brief?");
 }
 
+TEST_F(CompletionStringTest, GetDeclCommentBadUTF8) {
+  // 0xFF is never valid in UTF-8; expect U+FFFD (EF BF BD) in its place.
+  auto TU = TestTU::withCode("/*x\xffy*/ struct X;");
+  auto AST = TU.build();
+  EXPECT_EQ("x\xef\xbf\xbdy",
+            getDeclComment(AST.getASTContext(), findDecl(AST, "X")));
+}
+
 TEST_F(CompletionStringTest, MultipleAnnotations) {
   Builder.AddAnnotation("Ano1");
   Builder.AddAnnotation("Ano2");
index 3940946..80995ba 100644 (file)
@@ -1606,11 +1606,11 @@ TEST_F(SymbolCollectorTest, BadUTF8) {
   // Extracted from boost/spirit/home/support/char_encoding/iso8859_1.hpp
   // This looks like UTF-8 and fools clang, but has high-ISO-8859-1 comments.
   const char *Header = "int PUNCT = 0;\n"
-                       "int types[] = { /* \xa1 */PUNCT };";
+                       "/* \xa1 */ int types[] = { /* \xa1 */PUNCT };";
   CollectorOpts.RefFilter = RefKind::All;
   CollectorOpts.RefsInHeaders = true;
   runSymbolCollector(Header, "");
-  EXPECT_THAT(Symbols, Contains(QName("types")));
+  EXPECT_THAT(Symbols, Contains(AllOf(QName("types"), Doc("\xef\xbf\xbd "))));
   EXPECT_THAT(Symbols, Contains(QName("PUNCT")));
   // Reference is stored, although offset within line is not reliable.
   EXPECT_THAT(Refs, Contains(Pair(findSymbol(Symbols, "PUNCT").ID, _)));