[Clang] Add a warning on invalid UTF-8 in comments.

author Corentin Jabot <corentinjabot@gmail.com>

Fri, 17 Jun 2022 14:23:41 +0000 (16:23 +0200)

committer Corentin Jabot <corentinjabot@gmail.com>

Sat, 9 Jul 2022 09:26:45 +0000 (11:26 +0200)
author Corentin Jabot <corentinjabot@gmail.com>
Fri, 17 Jun 2022 14:23:41 +0000 (16:23 +0200)
committer Corentin Jabot <corentinjabot@gmail.com>
Sat, 9 Jul 2022 09:26:45 +0000 (11:26 +0200)
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst

index 5dae620..da14489 100644 (file)
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -279,9 +279,11 @@ Improvements to Clang's diagnostics
    unevaluated operands of a ``typeid`` expression, as they are now
    modeled correctly in the CFG. This fixes
    `Issue 21668 <https://github.com/llvm/llvm-project/issues/21668>`_.
-- ``-Wself-assign``, ``-Wself-assign-overloaded`` and ``-Wself-move`` will 
+- ``-Wself-assign``, ``-Wself-assign-overloaded`` and ``-Wself-move`` will
    suggest a fix if the decl being assigned is a parameter that shadows a data
    member of the contained class.
+- Added ``-Winvalid-utf8`` which diagnoses invalid UTF-8 code unit sequences in
+  comments.
  
  Non-comprehensive list of changes in this release
  -------------------------------------------------
@@ -592,7 +594,7 @@ AST Matchers
  
  - Added ``forEachTemplateArgument`` matcher which creates a match every
    time a ``templateArgument`` matches the matcher supplied to it.
-  
+
  - Added ``objcStringLiteral`` matcher which matches ObjectiveC String
    literal expressions.
  
diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td

index ac86076..38ee022 100644 (file)
--- a/clang/include/clang/Basic/DiagnosticLexKinds.td
+++ b/clang/include/clang/Basic/DiagnosticLexKinds.td
@@ -113,6 +113,8 @@ def warn_four_char_character_literal : Warning<
  // Unicode and UCNs
  def err_invalid_utf8 : Error<
    "source file is not valid UTF-8">;
+def warn_invalid_utf8_in_comment : Extension<
+  "invalid UTF-8 in comment">, InGroup<DiagGroup<"invalid-utf8">>;
  def err_character_not_allowed : Error<
    "unexpected character <U+%0>">;
  def err_character_not_allowed_identifier : Error<
diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp

index 6820057..799f301 100644 (file)
--- a/clang/lib/Lex/Lexer.cpp
+++ b/clang/lib/Lex/Lexer.cpp
@@ -2392,13 +2392,37 @@ bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
    //
    // This loop terminates with CurPtr pointing at the newline (or end of buffer)
    // character that ends the line comment.
+
+  // C++23 [lex.phases] p1
+  // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
+  // diagnostic only once per entire ill-formed subsequence to avoid
+  // emitting too many diagnostics (see http://unicode.org/review/pr-121.html).
+  bool UnicodeDecodingAlreadyDiagnosed = false;
+
    char C;
    while (true) {
      C = *CurPtr;
      // Skip over characters in the fast loop.
-    while (C != 0 &&                // Potentially EOF.
-           C != '\n' && C != '\r')  // Newline or DOS-style newline.
+    while (isASCII(C) && C != 0 &&   // Potentially EOF.
+           C != '\n' && C != '\r') { // Newline or DOS-style newline.
        C = *++CurPtr;
+      UnicodeDecodingAlreadyDiagnosed = false;
+    }
+
+    if (!isASCII(C)) {
+      unsigned Length = llvm::getUTF8SequenceSize(
+          (const llvm::UTF8 *)CurPtr, (const llvm::UTF8 *)BufferEnd);
+      if (Length == 0) {
+        if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
+          Diag(CurPtr, diag::warn_invalid_utf8_in_comment);
+        UnicodeDecodingAlreadyDiagnosed = true;
+        ++CurPtr;
+      } else {
+        UnicodeDecodingAlreadyDiagnosed = false;
+        CurPtr += Length;
+      }
+      continue;
+    }
  
      const char *NextLine = CurPtr;
      if (C != 0) {
@@ -2665,6 +2689,12 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
    if (C == '/')
      C = *CurPtr++;
  
+  // C++23 [lex.phases] p1
+  // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
+  // diagnostic only once per entire ill-formed subsequence to avoid
+  // emitting too many diagnostics (see http://unicode.org/review/pr-121.html).
+  bool UnicodeDecodingAlreadyDiagnosed = false;
+
    while (true) {
      // Skip over all non-interesting characters until we find end of buffer or a
      // (probably ending) '/' character.
@@ -2673,14 +2703,22 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
          // doesn't check for '\0'.
          !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
        // While not aligned to a 16-byte boundary.
-      while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0)
+      while (C != '/' && (intptr_t)CurPtr % 16 != 0) {
+        if (!isASCII(C))
+          goto MultiByteUTF8;
          C = *CurPtr++;
-
+      }
        if (C == '/') goto FoundSlash;
  
  #ifdef __SSE2__
        __m128i Slashes = _mm_set1_epi8('/');
-      while (CurPtr+16 <= BufferEnd) {
+      while (CurPtr + 16 < BufferEnd) {
+        int Mask = _mm_movemask_epi8(*(const __m128i *)CurPtr);
+        if (LLVM_UNLIKELY(Mask != 0)) {
+          CurPtr += llvm::countTrailingZeros<unsigned>(Mask);
+          goto MultiByteUTF8;
+        }
+        // look for slashes
          int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,
                                      Slashes));
          if (cmp != 0) {
@@ -2693,21 +2731,39 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
          CurPtr += 16;
        }
  #elif __ALTIVEC__
+      __vector unsigned char LongUTF = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+                                        0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+                                        0x80, 0x80, 0x80, 0x80};
        __vector unsigned char Slashes = {
          '/', '/', '/', '/',  '/', '/', '/', '/',
          '/', '/', '/', '/',  '/', '/', '/', '/'
        };
-      while (CurPtr + 16 <= BufferEnd &&
-             !vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes))
+      while (CurPtr + 16 < BufferEnd) {
+        if (LLVM_UNLIKELY(
+                vec_any_ge(*(const __vector unsigned char *)CurPtr, LongUTF)))
+          goto MultiByteUTF8;
+        if (vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes)) {
+          C = *CurPtr++;
+          break;
+        }
          CurPtr += 16;
+      }
+
  #else
-      // Scan for '/' quickly.  Many block comments are very large.
-      while (CurPtr[0] != '/' &&
-             CurPtr[1] != '/' &&
-             CurPtr[2] != '/' &&
-             CurPtr[3] != '/' &&
-             CurPtr+4 < BufferEnd) {
-        CurPtr += 4;
+      while (CurPtr + 16 < BufferEnd) {
+        bool HasNonASCII = false;
+        for (unsigned I = 0; I < 16; ++I)
+          HasNonASCII |= !isASCII(CurPtr[I]);
+
+        if (LLVM_UNLIKELY(HasNonASCII))
+          goto MultiByteUTF8;
+
+        bool HasSlash = false;
+        for (unsigned I = 0; I < 16; ++I)
+          HasSlash |= CurPtr[I] == '/';
+        if (HasSlash)
+          break;
+        CurPtr += 16;
        }
  #endif
  
@@ -2715,9 +2771,31 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
        C = *CurPtr++;
      }
  
-    // Loop to scan the remainder.
-    while (C != '/' && C != '\0')
+    // Loop to scan the remainder, warning on invalid UTF-8
+    // if the corresponding warning is enabled, emitting a diagnostic only once
+    // per sequence that cannot be decoded.
+    while (C != '/' && C != '\0') {
+      if (isASCII(C)) {
+        UnicodeDecodingAlreadyDiagnosed = false;
+        C = *CurPtr++;
+        continue;
+      }
+    MultiByteUTF8:
+      // CurPtr is 1 code unit past C, so to decode
+      // the codepoint, we need to read from the previous position.
+      unsigned Length = llvm::getUTF8SequenceSize(
+          (const llvm::UTF8 *)CurPtr-1, (const llvm::UTF8 *)BufferEnd);
+      if (Length == 0) {
+        if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
+          Diag(CurPtr-1, diag::warn_invalid_utf8_in_comment);
+        UnicodeDecodingAlreadyDiagnosed = true;
+      }
+      else {
+        UnicodeDecodingAlreadyDiagnosed = false;
+        CurPtr += Length - 1;
+      }
        C = *CurPtr++;
+    }
  
      if (C == '/') {
    FoundSlash:
diff --git a/clang/test/Lexer/comment-invalid-utf8.c b/clang/test/Lexer/comment-invalid-utf8.c

new file mode 100644 (file)

index 0000000..b8bf551
--- /dev/null
+++ b/clang/test/Lexer/comment-invalid-utf8.c
@@ -0,0 +1,27 @@
+// RUN: %clang_cc1 -fsyntax-only %s -Winvalid-utf8 -verify=expected
+// RUN: %clang_cc1 -fsyntax-only %s -verify=nowarn
+// nowarn-no-diagnostics
+
+// This file is purposefully encoded as windows-1252
+// be careful when modifying.
+
+//\80
+// expected-warning@-1 {{invalid UTF-8 in comment}}
+
+// \80 \82\83\84\85\86\87\88\89 \8a \8b \8c \8e
+// expected-warning@-1 6{{invalid UTF-8 in comment}}
+
+/*\80*/
+// expected-warning@-1 {{invalid UTF-8 in comment}}
+
+/*\80 \82\83\84\85\86\87\88\89 \8a \8b \8c \8e*/
+// expected-warning@-1 6{{invalid UTF-8 in comment}}
+
+/*
+\80
+*/
+// expected-warning@-2 {{invalid UTF-8 in comment}}
+
+// abcd
+// \80abcd
+// expected-warning@-1 {{invalid UTF-8 in comment}}
diff --git a/clang/test/Lexer/comment-utf8.c b/clang/test/Lexer/comment-utf8.c

new file mode 100644 (file)

index 0000000..87f2d13
--- /dev/null
+++ b/clang/test/Lexer/comment-utf8.c
@@ -0,0 +1,20 @@
+// RUN: %clang_cc1 -fsyntax-only %s -Winvalid-utf8 -verify
+// expected-no-diagnostics
+
+
+//§ § § 😀 你好 ©
+
+/*§ § § 😀 你好 ©*/
+
+/*
+§ § § 😀 你好 ©©©
+*/
+
+/* § § § 😀 你好 © */
+/*
+    a longer comment to exercise the vectorized code path
+    ----------------------------------------------------
+    αααααααααααααααααααααα      // here is some unicode
+    ----------------------------------------------------
+    ----------------------------------------------------
+*/
diff --git a/clang/test/SemaCXX/static-assert.cpp b/clang/test/SemaCXX/static-assert.cpp

index 5801320..2ac0dfd 100644 (file)
--- a/clang/test/SemaCXX/static-assert.cpp
+++ b/clang/test/SemaCXX/static-assert.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++11 -pedantic -triple=x86_64-linux-gnu
+// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++11 -pedantic -triple=x86_64-linux-gnu -Wno-invalid-utf8
  
  int f(); // expected-note {{declared here}}
  
diff --git a/llvm/include/llvm/Support/ConvertUTF.h b/llvm/include/llvm/Support/ConvertUTF.h

index 662f3ac..1e05cfe 100644 (file)
--- a/llvm/include/llvm/Support/ConvertUTF.h
+++ b/llvm/include/llvm/Support/ConvertUTF.h
@@ -181,6 +181,8 @@ Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd);
  
  Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd);
  
+unsigned getUTF8SequenceSize(const UTF8 *source, const UTF8 *sourceEnd);
+
  unsigned getNumBytesForUTF8(UTF8 firstByte);
  
  /*************************************************************************/
diff --git a/llvm/lib/Support/ConvertUTF.cpp b/llvm/lib/Support/ConvertUTF.cpp

index e24a918..cc411fa 100644 (file)
--- a/llvm/lib/Support/ConvertUTF.cpp
+++ b/llvm/lib/Support/ConvertUTF.cpp
@@ -417,6 +417,16 @@ Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
      return isLegalUTF8(source, length);
  }
  
+/*
+ * Exported function to return the size of the first UTF-8 code unit sequence,
+ * or 0 if it is invalid or overruns sourceEnd; *source is always dereferenced.
+ */
+unsigned getUTF8SequenceSize(const UTF8 *source, const UTF8 *sourceEnd) {
+  int length = trailingBytesForUTF8[*source] + 1;
+  return (length <= sourceEnd - source && isLegalUTF8(source, length)) ? length
+                                                                         : 0;
+}
+
  /* --------------------------------------------------------------------- */
  
  static unsigned
author	Corentin Jabot <corentinjabot@gmail.com>
	Fri, 17 Jun 2022 14:23:41 +0000 (16:23 +0200)
committer	Corentin Jabot <corentinjabot@gmail.com>
	Sat, 9 Jul 2022 09:26:45 +0000 (11:26 +0200)
clang/docs/ReleaseNotes.rst		patch \| blob \| history
clang/include/clang/Basic/DiagnosticLexKinds.td		patch \| blob \| history
clang/lib/Lex/Lexer.cpp		patch \| blob \| history
clang/test/Lexer/comment-invalid-utf8.c	[new file with mode: 0644]	patch \| blob
clang/test/Lexer/comment-utf8.c	[new file with mode: 0644]	patch \| blob
clang/test/SemaCXX/static-assert.cpp		patch \| blob \| history
llvm/include/llvm/Support/ConvertUTF.h		patch \| blob \| history
llvm/lib/Support/ConvertUTF.cpp		patch \| blob \| history