From: Seth Cantrell <seth.cantrell@gmail.com>
Date: Sun, 28 Oct 2012 18:24:46 +0000 (+0000)
Subject: improve highlighting of invalid string encodings
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=4cfc817a9ad9f30f3380f7f5cd816f65cd1d4c49;p=platform%2Fupstream%2Fllvm.git

improve highlighting of invalid string encodings

Limit each highlight to exactly the bytes of the bad encoding, and highlight
every bad encoding in a string rather than only the first one.

llvm-svn: 166900
---

diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp
index 9171449..2896dc3 100644
--- a/clang/lib/Lex/LiteralSupport.cpp
+++ b/clang/lib/Lex/LiteralSupport.cpp
@@ -49,6 +49,20 @@ static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) {
   }
 }
 
+static CharSourceRange MakeCharSourceRange(const LangOptions &Features,
+                                           FullSourceLoc TokLoc,
+                                           const char *TokBegin,
+                                           const char *TokRangeBegin,
+                                           const char *TokRangeEnd) {
+  SourceLocation Begin = // TokLoc advanced by (TokRangeBegin - TokBegin)
+    Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,
+                                   TokLoc.getManager(), Features);
+  SourceLocation End = // Begin advanced by (TokRangeEnd - TokRangeBegin)
+    Lexer::AdvanceToTokenCharacter(Begin, TokRangeEnd - TokRangeBegin,
+                                   TokLoc.getManager(), Features);
+  return CharSourceRange::getCharRange(Begin, End); // char range Begin..End
+}
+
 /// \brief Produce a diagnostic highlighting some portion of a literal.
 ///
 /// Emits the diagnostic \p DiagID, highlighting the range of characters from
@@ -61,11 +75,8 @@ static DiagnosticBuilder Diag(DiagnosticsEngine *Diags,
   SourceLocation Begin =
     Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,
                                    TokLoc.getManager(), Features);
-  SourceLocation End =
-    Lexer::AdvanceToTokenCharacter(Begin, TokRangeEnd - TokRangeBegin,
-                                   TokLoc.getManager(), Features);
-  return Diags->Report(Begin, DiagID)
-      << CharSourceRange::getCharRange(Begin, End);
+  return Diags->Report(Begin, DiagID) <<
+    MakeCharSourceRange(Features, TokLoc, TokBegin, TokRangeBegin, TokRangeEnd);
 }
 
 /// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
@@ -1372,6 +1383,15 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
   }
 }
 
+static const char *resync_utf8(const char *err, const char *end) {
+    if (err==end) // nothing left to skip
+        return end;
+    end = err + std::min<unsigned>(getNumBytesForUTF8(*err), end-err); // never scan past the original end; presumably the lead byte's sequence length
+    while (++err!=end && (*err&0xC0)==0x80) // step over bytes of the form 10xxxxxx
+      ;
+    return err; // first byte after the skipped sequence (or the clamped end)
+}
+
 /// \brief This function copies from Fragment, which is a sequence of bytes
 /// within Tok's contents (which begin at TokBegin) into ResultPtr.
 /// Performs widening for multi-byte characters.
@@ -1381,7 +1401,6 @@ bool StringLiteralParser::CopyStringFragment(const Token &Tok,
   const UTF8 *ErrorPtrTmp;
   if (ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr, ErrorPtrTmp))
     return false;
-  const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
 
   // If we see bad encoding for unprefixed string literals, warn and
   // simply copy the byte values, for compatibility with gcc and older
@@ -1391,12 +1410,31 @@ bool StringLiteralParser::CopyStringFragment(const Token &Tok,
     memcpy(ResultPtr, Fragment.data(), Fragment.size());
     ResultPtr += Fragment.size();
   }
+
   if (Diags) {
-    Diag(Diags, Features, FullSourceLoc(Tok.getLocation(), SM), TokBegin,
-         ErrorPtr, ErrorPtr + std::min<unsigned>(getNumBytesForUTF8(*ErrorPtr),
-                                                 Fragment.end() - ErrorPtr),
-         NoErrorOnBadEncoding ? diag::warn_bad_string_encoding
-                              : diag::err_bad_string_encoding);
+    const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
+
+    FullSourceLoc SourceLoc(Tok.getLocation(), SM);
+    const DiagnosticBuilder &Builder =
+      Diag(Diags, Features, SourceLoc, TokBegin,
+           ErrorPtr, resync_utf8(ErrorPtr, Fragment.end()),
+           NoErrorOnBadEncoding ? diag::warn_bad_string_encoding
+                                : diag::err_bad_string_encoding);
+
+    char *SavedResultPtr = ResultPtr;
+    const char *NextStart = resync_utf8(ErrorPtr, Fragment.end());
+    StringRef NextFragment(NextStart, Fragment.end()-NextStart);
+
+    while (!ConvertUTF8toWide(CharByteWidth, NextFragment, ResultPtr,
+                              ErrorPtrTmp)) {
+      const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
+      NextStart = resync_utf8(ErrorPtr, Fragment.end());
+      Builder << MakeCharSourceRange(Features, SourceLoc, TokBegin,
+                                     ErrorPtr, NextStart);
+      NextFragment = StringRef(NextStart, Fragment.end()-NextStart);
+    }
+
+    ResultPtr = SavedResultPtr;
   }
   return !NoErrorOnBadEncoding;
 }
diff --git a/clang/test/Misc/wrong-encoding.c b/clang/test/Misc/wrong-encoding.c
index 476c783..db37af9 100644
--- a/clang/test/Misc/wrong-encoding.c
+++ b/clang/test/Misc/wrong-encoding.c
@@ -1,16 +1,33 @@
-// RUN: %clang_cc1 -fsyntax-only %s 2>&1 | FileCheck -strict-whitespace %s
+// RUN: %clang_cc1 -fsyntax-only -Wno-unused-value %s 2>&1 | FileCheck -strict-whitespace %s
 
 void foo() {
 
   "§Ã"; // ø
 // CHECK: {{^  "<A7><C3>"; // <F8>}}
-// CHECK: {{^   \^~~~}}
+// CHECK: {{^   \^~~~~~~}}
 
   /* þ« */ const char *d = "¥";
 
 // CHECK: {{^  /\* <FE><AB> \*/ const char \*d = "<A5>";}}
 // CHECK: {{^                                  \^~~~}}
 
-// CHECK: {{^  "<A7><C3>"; // <F8>}}
-// CHECK: {{^  \^~~~~~~~~~}}
+  "xxé¿¿¿d";
+// CHECK: {{^  "xx<U\+9FFF><BF>d";}}
+// CHECK: {{^             \^~~~}}
+
+  "xxé¿bcd";
+// CHECK: {{^  "xx<E9><BF>bcd";}}
+// CHECK: {{^     \^~~~~~~~}}
+
+  "xxéabcd";
+// CHECK: {{^  "xx<E9>abcd";}}
+// CHECK: {{^     \^~~~}}
+
+  "xxé¿é¿d";
+// CHECK: {{^  "xx<E9><BF><E9><BF>d";}}
+// CHECK: {{^     \^~~~~~~~~~~~~~~}}
+
+  "xxé¿xxxxxxxxxxxxxxxxxxxxxé¿xx";
+// CHECK: {{^  "xx<E9><BF>xxxxxxxxxxxxxxxxxxxxx<E9><BF>xx";}}
+// CHECK: {{^     \^~~~~~~~                     ~~~~~~~~}}
 }