From 7f43dddae0669f0700cf04bf520c12d3301cd809 Mon Sep 17 00:00:00 2001
From: Jordan Rose <jordan_rose@apple.com>
Date: Thu, 24 Jan 2013 20:50:46 +0000
Subject: [PATCH] Handle universal character names and Unicode characters
 outside of literals.

This is a missing piece for C99 conformance.

This patch handles UCNs by adding a '\\' case to LexTokenInternal and
LexIdentifier -- if we see a backslash, we tentatively try to read in a UCN.
If the UCN is not syntactically well-formed, we fall back to the old
treatment: a backslash followed by an identifier beginning with 'u' (or 'U').

Because the spelling of an identifier with UCNs still has the UCN in it, we
need to convert that to UTF-8 in Preprocessor::LookUpIdentifierInfo.

Of course, valid code that does *not* use UCNs will see only a very minimal
performance hit (checks after each identifier for non-ASCII characters,
checks when converting raw_identifiers to identifiers that they do not
contain UCNs, and checks when getting the spelling of an identifier that it
does not contain a UCN).

This patch also adds basic support for actual UTF-8 in the source. This is
treated almost exactly the same as UCNs except that we consider stray
Unicode characters to be mistakes and offer a fixit to remove them.

llvm-svn: 173369
---
 clang/include/clang/Basic/ConvertUTF.h            |  10 +
 clang/include/clang/Basic/DiagnosticLexKinds.td   |  34 ++-
 clang/include/clang/Lex/Lexer.h                   |  20 ++
 clang/include/clang/Lex/Token.h                   |   8 +-
 clang/lib/Lex/Lexer.cpp                           | 288 +++++++++++++++++++++-
 clang/lib/Lex/Preprocessor.cpp                    |  60 ++++-
 clang/test/CXX/over/over.oper/over.literal/p8.cpp |   3 +-
 clang/test/CodeGen/ucn-identifiers.c              |  14 ++
 clang/test/FixIt/fixit-unicode.c                  |  12 +-
 clang/test/Lexer/utf8-invalid.c                   |   6 +
 clang/test/Preprocessor/ucn-pp-identifier.c       |  97 ++++++++
 clang/test/Sema/ucn-identifiers.c                 |  35 +++
 12 files changed, 554 insertions(+), 33 deletions(-)
 create mode 100644 clang/test/CodeGen/ucn-identifiers.c
 create mode 100644 clang/test/Lexer/utf8-invalid.c
 create mode 100644 clang/test/Preprocessor/ucn-pp-identifier.c
 create mode 100644 clang/test/Sema/ucn-identifiers.c
diff --git a/clang/include/clang/Basic/ConvertUTF.h b/clang/include/clang/Basic/ConvertUTF.h
index fb05afd..38956ee 100644
--- a/clang/include/clang/Basic/ConvertUTF.h
+++ b/clang/include/clang/Basic/ConvertUTF.h
@@ -161,6 +161,16 @@ Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd);
 
 unsigned getNumBytesForUTF8(UTF8 firstByte);
 
+static inline ConversionResult convertUTF8Sequence(const UTF8 **source,
+                                                   const UTF8 *sourceEnd,
+                                                   UTF32 *target,
+                                                   ConversionFlags flags) {
+  unsigned size = getNumBytesForUTF8(**source);
+  if (size > sourceEnd - *source)
+    return sourceExhausted;
+  return ConvertUTF8toUTF32(source, *source + size, &target, target + 1, flags);
+}
+
 #ifdef __cplusplus
 }
 
diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td
index c8b4423..00b385ef 100644
--- a/clang/include/clang/Basic/DiagnosticLexKinds.td
+++ b/clang/include/clang/Basic/DiagnosticLexKinds.td
@@ -93,15 +93,29 @@ def ext_multichar_character_literal : ExtWarn<
   "multi-character character constant">, InGroup<MultiChar>;
 def ext_four_char_character_literal : Extension<
   "multi-character character constant">, InGroup<FourByteMultiChar>;
-  
 
-// Literal
-def ext_nonstandard_escape : Extension<
-  "use of non-standard escape character '\\%0'">;
-def ext_unknown_escape : ExtWarn<"unknown escape sequence '\\%0'">;
-def err_hex_escape_no_digits : Error<"\\%0 used with no following hex digits">;
+
+// Unicode and UCNs
+def err_invalid_utf8 : Error<
+  "source file is not valid UTF-8">;
+def err_non_ascii : Error<
+  "non-ASCII characters are not allowed outside of literals and identifiers">;
+def ext_unicode_whitespace : ExtWarn<
+  "treating Unicode character as whitespace">,
+  InGroup<DiagGroup<"unicode-whitespace">>;
+
+def err_hex_escape_no_digits : Error<
+  "\\%0 used with no following hex digits">;
+def warn_ucn_escape_no_digits : Warning<
+  "\\%0 used with no following hex digits; "
+  "treating as '\\' followed by identifier">, InGroup<Unicode>;
+def err_ucn_escape_incomplete : Error<
+  "incomplete universal character name">;
+def warn_ucn_escape_incomplete : Warning<
+  "incomplete universal character name; "
+  "treating as '\\' followed by identifier">, InGroup<Unicode>;
 def err_ucn_escape_invalid : Error<"invalid universal character">;
-def err_ucn_escape_incomplete : Error<"incomplete universal character name">;
+
 def err_ucn_escape_basic_scs : Error<
   "character '%0' cannot be specified by a universal character name">;
 def err_ucn_control_character : Error<
@@ -112,6 +126,12 @@ def warn_cxx98_compat_literal_ucn_escape_basic_scs : Warning<
 def warn_cxx98_compat_literal_ucn_control_character : Warning<
   "universal character name referring to a control character "
   "is incompatible with C++98">, InGroup<CXX98Compat>, DefaultIgnore;
+
+
+// Literal
+def ext_nonstandard_escape : Extension<
+  "use of non-standard escape character '\\%0'">;
+def ext_unknown_escape : ExtWarn<"unknown escape sequence '\\%0'">;
 def err_invalid_decimal_digit : Error<"invalid digit '%0' in decimal constant">;
 def err_invalid_binary_digit : Error<"invalid digit '%0' in binary constant">;
 def err_invalid_octal_digit : Error<"invalid digit '%0' in octal constant">;
diff --git a/clang/include/clang/Lex/Lexer.h b/clang/include/clang/Lex/Lexer.h
index d36189f..535baf5 100644
--- a/clang/include/clang/Lex/Lexer.h
+++ b/clang/include/clang/Lex/Lexer.h
@@ -437,6 +437,11 @@ private:
   ///
   void LexTokenInternal(Token &Result);
 
+  /// Given that a token begins with the Unicode character \p C, figure out
+  /// what kind of token it is and dispatch to the appropriate lexing helper
+  /// function.
+  void LexUnicode(Token &Result, uint32_t C, const char *CurPtr);
+
   /// FormTokenWithChars - When we lex a token, we have identified a span
   /// starting at BufferPtr, going to TokEnd that forms the token.  This method
   /// takes that range and assigns it to the token as its location and size.  In
@@ -579,6 +584,21 @@ private:
   void cutOffLexing() { BufferPtr = BufferEnd; }
 
   bool isHexaLiteral(const char *Start, const LangOptions &LangOpts);
+
+
+  /// Read a universal character name.
+  ///
+  /// \param CurPtr The position in the source buffer after the initial '\'.
+  ///               If the UCN is syntactically well-formed (but not necessarily
+  ///               valid), this parameter will be updated to point to the
+  ///               character after the UCN.
+  /// \param SlashLoc The position in the source buffer of the '\'.
+  /// \param Tok The token being formed. Pass \c NULL to suppress diagnostics
+  ///            and handle token formation in the caller.
+  ///
+  /// \return The Unicode codepoint specified by the UCN, or 0 if the UCN is
+  ///         invalid.
+  uint32_t tryReadUCN(const char *&CurPtr, const char *SlashLoc, Token *Tok);
 };
 
 
diff --git a/clang/include/clang/Lex/Token.h b/clang/include/clang/Lex/Token.h
index 06ff56e..bcbe9c9 100644
--- a/clang/include/clang/Lex/Token.h
+++ b/clang/include/clang/Lex/Token.h
@@ -74,9 +74,10 @@ public:
     StartOfLine   = 0x01,  // At start of line or only after whitespace.
     LeadingSpace  = 0x02,  // Whitespace exists before this token.
     DisableExpand = 0x04,  // This identifier may never be macro expanded.
-    NeedsCleaning = 0x08,   // Contained an escaped newline or trigraph.
+    NeedsCleaning = 0x08,  // Contained an escaped newline or trigraph.
     LeadingEmptyMacro = 0x10, // Empty macro exists before this token.
-    HasUDSuffix = 0x20     // This string or character literal has a ud-suffix.
+    HasUDSuffix = 0x20,    // This string or character literal has a ud-suffix.
+    HasUCN = 0x40          // This identifier contains a UCN.
   };
 
   tok::TokenKind getKind() const { return (tok::TokenKind)Kind; }
@@ -257,6 +258,9 @@ public:
   /// \brief Return true if this token is a string or character literal which
   /// has a ud-suffix.
   bool hasUDSuffix() const { return (Flags & HasUDSuffix) ? true : false; }
+
+  /// Returns true if this token contains a universal character name.
+  bool hasUCN() const { return (Flags & HasUCN) ? true : false; }
 };
 
 /// \brief Information about the conditional stack (\#if directives)
diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp
index 2bd95ab..e6ffca9 100644
--- a/clang/lib/Lex/Lexer.cpp
+++ b/clang/lib/Lex/Lexer.cpp
@@ -25,11 +25,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang/Lex/Lexer.h"
+#include "clang/Basic/ConvertUTF.h"
 #include "clang/Basic/SourceManager.h"
 #include "clang/Lex/CodeCompletionHandler.h"
 #include "clang/Lex/LexDiagnostic.h"
 #include "clang/Lex/Preprocessor.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/MemoryBuffer.h"
@@ -371,10 +373,12 @@ unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
   // NOTE: this has to be checked *before* testing for an IdentifierInfo.
   if (Tok.is(tok::raw_identifier))
     TokStart = Tok.getRawIdentifierData();
-  else if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
-    // Just return the string from the identifier table, which is very quick.
-    Buffer = II->getNameStart();
-    return II->getLength();
+  else if (!Tok.hasUCN()) {
+    if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
+      // Just return the string from the identifier table, which is very quick.
+      Buffer = II->getNameStart();
+      return II->getLength();
+    }
   }
 
   // NOTE: this can be checked even after testing for an IdentifierInfo.
@@ -1376,7 +1380,6 @@ SourceLocation Lexer::findLocationAfterToken(SourceLocation Loc,
 ///   2. If this is an escaped newline (potentially with whitespace between
 ///      the backslash and newline), implicitly skip the newline and return
 ///      the char after it.
-///   3. If this is a UCN, return it.  FIXME: C++ UCN's?
 ///
 /// This handles the slow/uncommon case of the getCharAndSize method.  Here we
 /// know that we can accumulate into Size, and that we have already incremented
@@ -1509,6 +1512,77 @@ void Lexer::SkipBytes(unsigned Bytes, bool StartOfLine) {
   IsAtStartOfLine = StartOfLine;
 }
 
+namespace {
+  struct UCNCharRange {
+    uint32_t Lower;
+    uint32_t Upper;
+  };
+  
+  // C11 D.1, C++11 [charname.allowed]
+  // FIXME: C99 and C++03 each have a different set of allowed UCNs.
+  const UCNCharRange UCNAllowedCharRanges[] = {
+    // 1
+    { 0x00A8, 0x00A8 }, { 0x00AA, 0x00AA }, { 0x00AD, 0x00AD },
+    { 0x00AF, 0x00AF }, { 0x00B2, 0x00B5 }, { 0x00B7, 0x00BA },
+    { 0x00BC, 0x00BE }, { 0x00C0, 0x00D6 }, { 0x00D8, 0x00F6 },
+    { 0x00F8, 0x00FF },
+    // 2
+    { 0x0100, 0x167F }, { 0x1681, 0x180D }, { 0x180F, 0x1FFF },
+    // 3
+    { 0x200B, 0x200D }, { 0x202A, 0x202E }, { 0x203F, 0x2040 },
+    { 0x2054, 0x2054 }, { 0x2060, 0x206F },
+    // 4
+    { 0x2070, 0x218F }, { 0x2460, 0x24FF }, { 0x2776, 0x2793 },
+    { 0x2C00, 0x2DFF }, { 0x2E80, 0x2FFF },
+    // 5
+    { 0x3004, 0x3007 }, { 0x3021, 0x302F }, { 0x3031, 0x303F },
+    // 6
+    { 0x3040, 0xD7FF },
+    // 7
+    { 0xF900, 0xFD3D }, { 0xFD40, 0xFDCF }, { 0xFDF0, 0xFE44 },
+    { 0xFE47, 0xFFFD },
+    // 8
+    { 0x10000, 0x1FFFD }, { 0x20000, 0x2FFFD }, { 0x30000, 0x3FFFD },
+    { 0x40000, 0x4FFFD }, { 0x50000, 0x5FFFD }, { 0x60000, 0x6FFFD },
+    { 0x70000, 0x7FFFD }, { 0x80000, 0x8FFFD }, { 0x90000, 0x9FFFD },
+    { 0xA0000, 0xAFFFD }, { 0xB0000, 0xBFFFD }, { 0xC0000, 0xCFFFD },
+    { 0xD0000, 0xDFFFD }, { 0xE0000, 0xEFFFD }
+  };
+}
+
+static bool isAllowedIDChar(uint32_t c) {
+  unsigned LowPoint = 0;
+  unsigned HighPoint = llvm::array_lengthof(UCNAllowedCharRanges);
+
+  // Binary search the UCNAllowedCharRanges set.
+  while (HighPoint != LowPoint) {
+    unsigned MidPoint = (HighPoint + LowPoint) / 2;
+    if (c < UCNAllowedCharRanges[MidPoint].Lower)
+      HighPoint = MidPoint;
+    else if (c > UCNAllowedCharRanges[MidPoint].Upper)
+      LowPoint = MidPoint + 1;
+    else
+      return true;
+  }
+
+  return false;
+}
+
+static bool isAllowedInitiallyIDChar(uint32_t c) {
+  // C11 D.2, C++11 [charname.disallowed]
+  // FIXME: C99 only forbids "digits", presumably as described in C99 Annex D.
+  // FIXME: C++03 does not forbid any initial characters.
+  return !(0x0300 <= c && c <= 0x036F) &&
+         !(0x1DC0 <= c && c <= 0x1DFF) &&
+         !(0x20D0 <= c && c <= 0x20FF) &&
+         !(0xFE20 <= c && c <= 0xFE2F);
+}
+
+static inline bool isASCII(char C) {
+  return static_cast<signed char>(C) >= 0;
+}
+
+
 void Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
   // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
   unsigned Size;
@@ -1520,11 +1594,11 @@ void Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
 
   // Fast path, no $,\,? in identifier found.  '\' might be an escaped newline
   // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN.
-  // FIXME: UCNs.
   //
   // TODO: Could merge these checks into a CharInfo flag to make the comparison
   // cheaper
-  if (C != '\\' && C != '?' && (C != '$' || !LangOpts.DollarIdents)) {
+  if (isASCII(C) && C != '\\' && C != '?' &&
+      (C != '$' || !LangOpts.DollarIdents)) {
 FinishIdentifier:
     const char *IdStart = BufferPtr;
     FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
@@ -1561,8 +1635,38 @@ FinishIdentifier:
       CurPtr = ConsumeChar(CurPtr, Size, Result);
       C = getCharAndSize(CurPtr, Size);
       continue;
-    } else if (!isIdentifierBody(C)) { // FIXME: UCNs.
-      // Found end of identifier.
+
+    } else if (C == '\\') {
+      const char *UCNPtr = CurPtr + Size;
+      uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/0);
+      if (CodePoint == 0 || !isAllowedIDChar(CodePoint))
+        goto FinishIdentifier;
+
+      Result.setFlag(Token::HasUCN);
+      if ((UCNPtr - CurPtr ==  6 && CurPtr[1] == 'u') ||
+          (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))
+        CurPtr = UCNPtr;
+      else
+        while (CurPtr != UCNPtr)
+          (void)getAndAdvanceChar(CurPtr, Result);
+
+      C = getCharAndSize(CurPtr, Size);
+      continue;
+    } else if (!isASCII(C)) {
+      const char *UnicodePtr = CurPtr;
+      UTF32 CodePoint;
+      ConversionResult Result = convertUTF8Sequence((const UTF8 **)&UnicodePtr,
+                                                    (const UTF8 *)BufferEnd,
+                                                    &CodePoint,
+                                                    strictConversion);
+      if (Result != conversionOK ||
+          !isAllowedIDChar(static_cast<uint32_t>(CodePoint)))
+        goto FinishIdentifier;
+
+      CurPtr = UnicodePtr;
+      C = getCharAndSize(CurPtr, Size);
+      continue;
+    } else if (!isIdentifierBody(C)) {
       goto FinishIdentifier;
     }
 
@@ -1570,7 +1674,7 @@ FinishIdentifier:
     CurPtr = ConsumeChar(CurPtr, Size, Result);
 
     C = getCharAndSize(CurPtr, Size);
-    while (isIdentifierBody(C)) { // FIXME: UCNs.
+    while (isIdentifierBody(C)) {
       CurPtr = ConsumeChar(CurPtr, Size, Result);
       C = getCharAndSize(CurPtr, Size);
     }
@@ -2592,6 +2696,135 @@ bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {
   return false;
 }
 
+uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
+                           Token *Result) {
+  assert(LangOpts.CPlusPlus || LangOpts.C99);
+
+  unsigned CharSize;
+  char Kind = getCharAndSize(StartPtr, CharSize);
+
+  unsigned NumHexDigits;
+  if (Kind == 'u')
+    NumHexDigits = 4;
+  else if (Kind == 'U')
+    NumHexDigits = 8;
+  else
+    return 0;
+
+  const char *CurPtr = StartPtr + CharSize;
+  const char *KindLoc = &CurPtr[-1];
+
+  uint32_t CodePoint = 0;
+  for (unsigned i = 0; i < NumHexDigits; ++i) {
+    char C = getCharAndSize(CurPtr, CharSize);
+
+    unsigned Value = llvm::hexDigitValue(C);
+    if (Value == -1U) {
+      if (Result && !isLexingRawMode()) {
+        if (i == 0) {
+          Diag(BufferPtr, diag::warn_ucn_escape_no_digits)
+            << StringRef(KindLoc, 1);
+        } else {
+          // FIXME: if i == 4 and NumHexDigits == 8, suggest a fixit to \u.
+          Diag(BufferPtr, diag::warn_ucn_escape_incomplete);
+        }
+      }
+      
+      return 0;
+    }
+
+    CodePoint <<= 4;
+    CodePoint += Value;
+
+    CurPtr += CharSize;
+  }
+
+  if (Result) {
+    Result->setFlag(Token::HasUCN);
+    if (CurPtr - StartPtr == NumHexDigits + 2)
+      StartPtr = CurPtr;
+    else
+      while (StartPtr != CurPtr)
+        (void)getAndAdvanceChar(StartPtr, *Result);
+  } else {
+    StartPtr = CurPtr;
+  }
+
+  // C99 6.4.3p2: A universal character name shall not specify a character whose
+  //   short identifier is less than 00A0 other than 0024 ($), 0040 (@), or
+  //   0060 (`), nor one in the range D800 through DFFF inclusive.)
+  // C++11 [lex.charset]p2: If the hexadecimal value for a
+  //   universal-character-name corresponds to a surrogate code point (in the
+  //   range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally,
+  //   if the hexadecimal value for a universal-character-name outside the
+  //   c-char-sequence, s-char-sequence, or r-char-sequence of a character or
+  //   string literal corresponds to a control character (in either of the
+  //   ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the
+  //   basic source character set, the program is ill-formed.
+  if (CodePoint < 0xA0) {
+    if (CodePoint == 0x24 || CodePoint == 0x40 || CodePoint == 0x60)
+      return CodePoint;
+
+    // We don't use isLexingRawMode() here because we need to warn about bad
+    // UCNs even when skipping preprocessing tokens in a #if block.
+    if (Result && PP) {
+      if (CodePoint < 0x20 || CodePoint >= 0x7F)
+        Diag(BufferPtr, diag::err_ucn_control_character);
+      else {
+        char C = static_cast<char>(CodePoint);
+        Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1);
+      }
+    }
+
+    return 0;
+    
+  } else if ((!LangOpts.CPlusPlus || LangOpts.CPlusPlus11) &&
+             (CodePoint >= 0xD800 && CodePoint <= 0xDFFF)) {
+    // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't.
+    // We don't use isLexingRawMode() here because we need to warn about bad
+    // UCNs even when skipping preprocessing tokens in a #if block.
+    if (Result && PP)
+      Diag(BufferPtr, diag::err_ucn_escape_invalid);
+    return 0;
+  }
+
+  return CodePoint;
+}
+
+void Lexer::LexUnicode(Token &Result, uint32_t C, const char *CurPtr) {
+  if (isAllowedIDChar(C) && isAllowedInitiallyIDChar(C)) {
+    MIOpt.ReadToken();
+    return LexIdentifier(Result, CurPtr);
+  }
+
+  if (!isASCII(*BufferPtr) && !isAllowedIDChar(C)) {
+    // Non-ASCII characters tend to creep into source code unintentionally.
+    // Instead of letting the parser complain about the unknown token,
+    // just drop the character.
+    // Note that we can /only/ do this when the non-ASCII character is actually
+    // spelled as Unicode, not written as a UCN. The standard requires that
+    // we not throw away any possible preprocessor tokens, but there's a
+    // loophole in the mapping of Unicode characters to basic character set
+    // characters that allows us to map these particular characters to, say,
+    // whitespace.
+    if (!isLexingRawMode()) {
+      CharSourceRange CharRange =
+        CharSourceRange::getCharRange(getSourceLocation(),
+                                      getSourceLocation(CurPtr));
+      Diag(BufferPtr, diag::err_non_ascii)
+        << FixItHint::CreateRemoval(CharRange);
+    }
+
+    BufferPtr = CurPtr;
+    return LexTokenInternal(Result);
+  }
+
+  // Otherwise, we have an explicit UCN or a character that's unlikely to show
+  // up by accident.
+  MIOpt.ReadToken();
+  FormTokenWithChars(Result, CurPtr, tok::unknown);
+}
+
 
 /// LexTokenInternal - This implements a simple C family lexer.  It is an
 /// extremely performance critical piece of code.  This assumes that the buffer
@@ -3243,12 +3476,41 @@ LexNextToken:
       Kind = tok::unknown;
     break;
 
+  // UCNs (C99 6.4.3, C++11 [lex.charset]p2)
   case '\\':
-    // FIXME: UCN's.
-    // FALL THROUGH.
-  default:
+    if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result))
+      return LexUnicode(Result, CodePoint, CurPtr);
+
     Kind = tok::unknown;
     break;
+
+  default: {
+    if (isASCII(Char)) {
+      Kind = tok::unknown;
+      break;
+    }
+
+    UTF32 CodePoint;
+
+    // We can't just reset CurPtr to BufferPtr because BufferPtr may point to
+    // an escaped newline.
+    --CurPtr;
+    ConversionResult Status = convertUTF8Sequence((const UTF8 **)&CurPtr,
+                                                  (const UTF8 *)BufferEnd,
+                                                  &CodePoint,
+                                                  strictConversion);
+    if (Status == conversionOK)
+      return LexUnicode(Result, CodePoint, CurPtr);
+    
+    // Non-ASCII characters tend to creep into source code unintentionally.
+    // Instead of letting the parser complain about the unknown token,
+    // just warn that we don't have valid UTF-8, then drop the character.
+    if (!isLexingRawMode())
+      Diag(CurPtr, diag::err_invalid_utf8);
+
+    BufferPtr = CurPtr+1;
+    goto LexNextToken;
+  }
   }
 
   // Notify MIOpt that we read a non-whitespace/non-comment token.
diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp
index c01019c..b933a5f 100644
--- a/clang/lib/Lex/Preprocessor.cpp
+++ b/clang/lib/Lex/Preprocessor.cpp
@@ -27,6 +27,7 @@
 
 #include "clang/Lex/Preprocessor.h"
 #include "MacroArgs.h"
+#include "clang/Basic/ConvertUTF.h"
 #include "clang/Basic/FileManager.h"
 #include "clang/Basic/SourceManager.h"
 #include "clang/Basic/TargetInfo.h"
@@ -43,6 +44,8 @@
 #include "clang/Lex/ScratchBuffer.h"
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/Support/Capacity.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/raw_ostream.h"
@@ -396,7 +399,7 @@ StringRef Preprocessor::getSpelling(const Token &Tok,
                                           SmallVectorImpl<char> &Buffer,
                                           bool *Invalid) const {
   // NOTE: this has to be checked *before* testing for an IdentifierInfo.
-  if (Tok.isNot(tok::raw_identifier)) {
+  if (Tok.isNot(tok::raw_identifier) && !Tok.hasUCN()) {
     // Try the fast path.
     if (const IdentifierInfo *II = Tok.getIdentifierInfo())
       return II->getName();
@@ -494,6 +497,48 @@ void Preprocessor::EndSourceFile() {
 // Lexer Event Handling.
 //===----------------------------------------------------------------------===//
 
+static void appendCodePoint(unsigned Codepoint,
+                            llvm::SmallVectorImpl<char> &Str) {
+  char ResultBuf[4];
+  char *ResultPtr = ResultBuf;
+  bool Res = ConvertCodePointToUTF8(Codepoint, ResultPtr);
+  (void)Res;
+  assert(Res && "Unexpected conversion failure");
+  Str.append(ResultBuf, ResultPtr);
+}
+
+static void expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
+  for (StringRef::iterator I = Input.begin(), E = Input.end(); I != E; ++I) {
+    if (*I != '\\') {
+      Buf.push_back(*I);
+      continue;
+    }
+
+    ++I;
+    assert(*I == 'u' || *I == 'U');
+
+    unsigned NumHexDigits;
+    if (*I == 'u')
+      NumHexDigits = 4;
+    else
+      NumHexDigits = 8;
+
+    assert(I + NumHexDigits <= E);
+
+    uint32_t CodePoint = 0;
+    for (++I; NumHexDigits != 0; ++I, --NumHexDigits) {
+      unsigned Value = llvm::hexDigitValue(*I);
+      assert(Value != -1U);
+
+      CodePoint <<= 4;
+      CodePoint += Value;
+    }
+
+    appendCodePoint(CodePoint, Buf);
+    --I;
+  }
+}
+
 /// LookUpIdentifierInfo - Given a tok::raw_identifier token, look up the
 /// identifier information for the token and install it into the token,
 /// updating the token kind accordingly.
@@ -502,15 +547,22 @@ IdentifierInfo *Preprocessor::LookUpIdentifierInfo(Token &Identifier) const {
 
   // Look up this token, see if it is a macro, or if it is a language keyword.
   IdentifierInfo *II;
-  if (!Identifier.needsCleaning()) {
+  if (!Identifier.needsCleaning() && !Identifier.hasUCN()) {
     // No cleaning needed, just use the characters from the lexed buffer.
     II = getIdentifierInfo(StringRef(Identifier.getRawIdentifierData(),
-                                           Identifier.getLength()));
+                                     Identifier.getLength()));
   } else {
     // Cleaning needed, alloca a buffer, clean into it, then use the buffer.
     SmallString<64> IdentifierBuffer;
     StringRef CleanedStr = getSpelling(Identifier, IdentifierBuffer);
-    II = getIdentifierInfo(CleanedStr);
+
+    if (Identifier.hasUCN()) {
+      SmallString<64> UCNIdentifierBuffer;
+      expandUCNs(UCNIdentifierBuffer, CleanedStr);
+      II = getIdentifierInfo(UCNIdentifierBuffer);
+    } else {
+      II = getIdentifierInfo(CleanedStr);
+    }
   }
 
   // Update the token info (identifier info and appropriate token kind).
diff --git a/clang/test/CXX/over/over.oper/over.literal/p8.cpp b/clang/test/CXX/over/over.oper/over.literal/p8.cpp
index 6f63610..70a1843 100644
--- a/clang/test/CXX/over/over.oper/over.literal/p8.cpp
+++ b/clang/test/CXX/over/over.oper/over.literal/p8.cpp
@@ -7,8 +7,7 @@ namespace std {
 
 void operator "" _km(long double); // ok
 string operator "" _i18n(const char*, std::size_t); // ok
-// FIXME: This should be accepted once we support UCNs
-template<char...> int operator "" \u03C0(); // ok, UCN for lowercase pi // expected-error {{expected identifier}}
+template<char...> int operator "" \u03C0(); // ok, UCN for lowercase pi // expected-warning {{reserved}}
 float operator ""E(const char *); // expected-error {{invalid suffix on literal}} expected-warning {{reserved}}
 float operator " " B(const char *); // expected-error {{must be '""'}} expected-warning {{reserved}}
 string operator "" 5X(const char *, std::size_t); // expected-error {{expected identifier}}
diff --git a/clang/test/CodeGen/ucn-identifiers.c b/clang/test/CodeGen/ucn-identifiers.c
new file mode 100644
index 0000000..56e3aa5
--- /dev/null
+++ b/clang/test/CodeGen/ucn-identifiers.c
@@ -0,0 +1,14 @@
+// RUN: %clang_cc1 %s -emit-llvm -o /dev/null
+// RUN: %clang_cc1 %s -emit-llvm -o /dev/null -x c++
+// This file contains UTF-8; please do not fix!
+
+
+extern void \u00FCber(int);
+extern void \U000000FCber(int); // redeclaration, no warning
+
+void goodCalls() {
+  \u00FCber(0);
+  \u00fcber(1);
+  Ã¼ber(2);
+  \U000000FCber(3);
+}
diff --git a/clang/test/FixIt/fixit-unicode.c b/clang/test/FixIt/fixit-unicode.c
index 2af5e08..c45ba06 100644
--- a/clang/test/FixIt/fixit-unicode.c
+++ b/clang/test/FixIt/fixit-unicode.c
@@ -8,13 +8,15 @@ struct Foo {
 // PR13312
 void test1() {
   struct Foo foo;
-  (&foo)â>bar = 42;
+  foo.bar = 42â
+// CHECK: error: non-ASCII characters are not allowed outside of literals and identifiers
+// CHECK: {{^              \^}}
 // CHECK: error: expected ';' after expression
 // Make sure we emit the fixit right in front of the snowman.
-// CHECK: {{^        \^}}
-// CHECK: {{^        ;}}
+// CHECK: {{^              \^}}
+// CHECK: {{^              ;}}
 
-// CHECK-MACHINE: fix-it:"{{.*}}fixit-unicode.c":{11:9-11:9}:";"
+// CHECK-MACHINE: fix-it:"{{.*}}fixit-unicode.c":{[[@LINE-8]]:15-[[@LINE-8]]:15}:";"
 }
 
 
@@ -29,5 +31,5 @@ void test2() {
 // because different systems will render the delta differently (either as a
 // character, or as <U+2206>.) The fixit should line up with the %d regardless.
 
-// CHECK-MACHINE: fix-it:"{{.*}}fixit-unicode.c":{23:16-23:18}:"%ld"
+// CHECK-MACHINE: fix-it:"{{.*}}fixit-unicode.c":{[[@LINE-9]]:16-[[@LINE-9]]:18}:"%ld"
 }
diff --git a/clang/test/Lexer/utf8-invalid.c b/clang/test/Lexer/utf8-invalid.c
new file mode 100644
index 0000000..c4dd318
--- /dev/null
+++ b/clang/test/Lexer/utf8-invalid.c
@@ -0,0 +1,6 @@
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+
+// Note: this file contains invalid UTF-8 before the variable name in the
+// next line. Please do not fix!
+
+extern int x; // expected-error{{source file is not valid UTF-8}}
diff --git a/clang/test/Preprocessor/ucn-pp-identifier.c b/clang/test/Preprocessor/ucn-pp-identifier.c
new file mode 100644
index 0000000..f4afa91
--- /dev/null
+++ b/clang/test/Preprocessor/ucn-pp-identifier.c
@@ -0,0 +1,97 @@
+// RUN: %clang_cc1 %s -fsyntax-only -std=c99 -pedantic -verify -Wundef
+// RUN: %clang_cc1 %s -fsyntax-only -x c++ -pedantic -verify -Wundef
+
+#define \u00FC
+#define a\u00FD() 0
+#ifndef \u00FC
+#error "This should never happen"
+#endif
+
+#if a\u00FD()
+#error "This should never happen"
+#endif
+
+#if a\U000000FD()
+#error "This should never happen"
+#endif
+
+#if \uarecool // expected-warning{{incomplete universal character name; treating as '\' followed by identifier}} expected-error {{invalid token at start of a preprocessor expression}}
+#endif
+#if \uwerecool // expected-warning{{\u used with no following hex digits; treating as '\' followed by identifier}} expected-error {{invalid token at start of a preprocessor expression}}
+#endif
+#if \U0001000  // expected-warning{{incomplete universal character name; treating as '\' followed by identifier}} expected-error {{invalid token at start of a preprocessor expression}}
+#endif
+
+// Make sure we reject disallowed UCNs
+#define \ufffe // expected-error {{macro names must be identifiers}}
+#define \U10000000  // expected-error {{macro names must be identifiers}}
+#define \u0061  // expected-error {{character 'a' cannot be specified by a universal character name}} expected-error {{macro names must be identifiers}}
+
+// FIXME: Not clear what our behavior should be here; \u0024 is "$".
+#define a\u0024  // expected-warning {{whitespace}}
+
+#if \u0110 // expected-warning {{is not defined, evaluates to 0}}
+#endif
+
+
+#define \u0110 1 / 0
+#if \u0110 // expected-error {{division by zero in preprocessor expression}}
+#endif
+
+#define STRINGIZE(X) # X
+
+extern int check_size[sizeof(STRINGIZE(\u0112)) == 3 ? 1 : -1];
+
+// Check that we still diagnose disallowed UCNs in #if 0 blocks.
+// C99 5.1.1.2p1 and C++11 [lex.phases]p1 dictate that preprocessor tokens are
+// formed before directives are parsed.
+// expected-error@+4 {{character 'a' cannot be specified by a universal character name}}
+#if 0
+#define \ufffe // okay
+#define \U10000000 // okay
+#define \u0061 // error, but -verify only looks at comments outside #if 0
+#endif
+
+
+// A UCN formed by token pasting is undefined in both C99 and C++.
+// Right now we don't do anything special, which causes us to coincidentally
+// accept the first case below but reject the second two.
+#define PASTE(A, B) A ## B
+extern int PASTE(\, u00FD);
+extern int PASTE(\u, 00FD); // expected-warning{{\u used with no following hex digits}}
+extern int PASTE(\u0, 0FD); // expected-warning{{incomplete universal character name}}
+#ifdef __cplusplus
+// expected-error@-3 {{expected unqualified-id}}
+// expected-error@-3 {{expected unqualified-id}}
+#else
+// expected-error@-6 {{expected identifier}}
+// expected-error@-6 {{expected identifier}}
+#endif
+
+
+// A UCN produced by line splicing is valid in C99 but undefined in C++.
+// Since undefined behavior can do anything including working as intended,
+// we just accept it in C++ as well.;
+#define newline_1_\u00F\
+C 1
+#define newline_2_\u00\
+F\
+C 1
+#define newline_3_\u\
+00\
+FC 1
+#define newline_4_\\
+u00FC 1
+#define newline_5_\\
+u\
+\
+0\
+0\
+F\
+C 1
+
+#if (newline_1_\u00FC && newline_2_\u00FC && newline_3_\u00FC && \
+     newline_4_\u00FC && newline_5_\u00FC)
+#else
+#error "Line splicing failed to produce UCNs"
+#endif
diff --git a/clang/test/Sema/ucn-identifiers.c b/clang/test/Sema/ucn-identifiers.c
new file mode 100644
index 0000000..6b263658
--- /dev/null
+++ b/clang/test/Sema/ucn-identifiers.c
@@ -0,0 +1,35 @@
+// RUN: %clang_cc1 %s -verify -fsyntax-only -pedantic
+// RUN: %clang_cc1 %s -verify -fsyntax-only -x c++ -pedantic
+
+// This file contains UTF-8; please do not fix!
+
+
+extern void \u00FCber(int);
+extern void \U000000FCber(int); // redeclaration, no warning
+#ifdef __cplusplus
+// expected-note@-2 + {{candidate function not viable}}
+#else
+// expected-note@-4 + {{declared here}}
+#endif
+
+void goodCalls() {
+  \u00FCber(0);
+  \u00fcber(1);
+  Ã¼ber(2);
+  \U000000FCber(3);
+}
+
+void badCalls() {
+  \u00FCber(0.5); // expected-warning{{implicit conversion from 'double' to 'int'}}
+  \u00fcber = 0; // expected-error{{non-object type 'void (int)' is not assignable}}
+
+  Ã¼ber(1, 2);
+  \U000000FCber(); 
+#ifdef __cplusplus
+  // expected-error@-3 {{no matching function}}
+  // expected-error@-3 {{no matching function}}
+#else
+  // expected-error@-6 {{too many arguments to function call, expected 1, have 2}}
+  // expected-error@-6 {{too few arguments to function call, expected 1, have 0}}
+#endif
+}
-- 
2.7.4