From 3e3a705062fe343dc397c0bf623aa383f14ce07c Mon Sep 17 00:00:00 2001 From: Richard Smith Date: Sat, 8 Nov 2014 06:08:42 +0000 Subject: [PATCH] [c++1z] Support for u8 character literals. llvm-svn: 221576 --- clang/include/clang/Basic/DiagnosticLexKinds.td | 3 +++ clang/include/clang/Basic/TokenKinds.def | 3 +++ clang/include/clang/Basic/TokenKinds.h | 6 +++--- clang/lib/Lex/Lexer.cpp | 20 ++++++++++++++------ clang/lib/Lex/LiteralSupport.cpp | 8 ++++++-- clang/lib/Lex/MacroArgs.cpp | 1 + clang/lib/Lex/PPExpressions.cpp | 1 + clang/lib/Lex/TokenConcatenation.cpp | 8 +++++++- clang/lib/Parse/ParseExpr.cpp | 1 + clang/lib/Parse/ParseTentative.cpp | 1 + clang/test/Lexer/utf8-char-literal.cpp | 9 +++++++++ clang/www/cxx_status.html | 9 +++++++-- 12 files changed, 56 insertions(+), 14 deletions(-) diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td index 9bc27ab..2fcfa02 100644 --- a/clang/include/clang/Basic/DiagnosticLexKinds.td +++ b/clang/include/clang/Basic/DiagnosticLexKinds.td @@ -201,6 +201,9 @@ def warn_c99_compat_unicode_literal : Warning< def warn_cxx98_compat_unicode_literal : Warning< "unicode literals are incompatible with C++98">, InGroup, DefaultIgnore; +def warn_cxx14_compat_u8_character_literal : Warning< + "unicode literals are incompatible with C++ standards before C++1z">, + InGroup, DefaultIgnore; def warn_cxx11_compat_user_defined_literal : Warning< "identifier after literal will be treated as a user-defined literal suffix " "in C++11">, InGroup, DefaultIgnore; diff --git a/clang/include/clang/Basic/TokenKinds.def b/clang/include/clang/Basic/TokenKinds.def index d2b06df..c96b8eb 100644 --- a/clang/include/clang/Basic/TokenKinds.def +++ b/clang/include/clang/Basic/TokenKinds.def @@ -133,6 +133,9 @@ TOK(numeric_constant) // 0x123 TOK(char_constant) // 'a' TOK(wide_char_constant) // L'b' +// C++1z Character Constants +TOK(utf8_char_constant) // u8'a' + // C++11 Character Constants TOK(utf16_char_constant) // u'a' TOK(utf32_char_constant) // U'a' diff --git a/clang/include/clang/Basic/TokenKinds.h b/clang/include/clang/Basic/TokenKinds.h index e2cffb4..f4ecb3e 100644 --- a/clang/include/clang/Basic/TokenKinds.h +++ b/clang/include/clang/Basic/TokenKinds.h @@ -86,9 +86,9 @@ inline bool isStringLiteral(TokenKind K) { /// constant, string, etc. inline bool isLiteral(TokenKind K) { return K == tok::numeric_constant || K == tok::char_constant || - K == tok::wide_char_constant || K == tok::utf16_char_constant || - K == tok::utf32_char_constant || isStringLiteral(K) || - K == tok::angle_string_literal; + K == tok::wide_char_constant || K == tok::utf8_char_constant || + K == tok::utf16_char_constant || K == tok::utf32_char_constant || + isStringLiteral(K) || K == tok::angle_string_literal; } /// \brief Return true if this is any of tok::annot_* kinds. diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp index 0aaad9b..c2e9716 100644 --- a/clang/lib/Lex/Lexer.cpp +++ b/clang/lib/Lex/Lexer.cpp @@ -1889,17 +1889,20 @@ bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) { /// LexCharConstant - Lex the remainder of a character constant, after having -/// lexed either ' or L' or u' or U'. +/// lexed either ' or L' or u8' or u' or U'. bool Lexer::LexCharConstant(Token &Result, const char *CurPtr, tok::TokenKind Kind) { // Does this character contain the \0 character? const char *NulCharacter = nullptr; - if (!isLexingRawMode() && - (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant)) - Diag(BufferPtr, getLangOpts().CPlusPlus - ? diag::warn_cxx98_compat_unicode_literal - : diag::warn_c99_compat_unicode_literal); + if (!isLexingRawMode()) { + if (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant) + Diag(BufferPtr, getLangOpts().CPlusPlus + ? diag::warn_cxx98_compat_unicode_literal + : diag::warn_c99_compat_unicode_literal); + else if (Kind == tok::utf8_char_constant) + Diag(BufferPtr, diag::warn_cxx14_compat_u8_character_literal); + } char C = getAndAdvanceChar(CurPtr, Result); if (C == '\'') { @@ -3068,6 +3071,11 @@ LexNextToken: ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), SizeTmp2, Result), tok::utf8_string_literal); + if (Char2 == '\'' && LangOpts.CPlusPlus1z) + return LexCharConstant( + Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), + SizeTmp2, Result), + tok::utf8_char_constant); if (Char2 == 'R' && LangOpts.CPlusPlus11) { unsigned SizeTmp3; diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp index 096805c..03331fb 100644 --- a/clang/lib/Lex/LiteralSupport.cpp +++ b/clang/lib/Lex/LiteralSupport.cpp @@ -28,6 +28,7 @@ static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) { default: llvm_unreachable("Unknown token type!"); case tok::char_constant: case tok::string_literal: + case tok::utf8_char_constant: case tok::utf8_string_literal: return Target.getCharWidth(); case tok::wide_char_constant: @@ -1031,9 +1032,10 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end, const char *TokBegin = begin; // Skip over wide character determinant. - if (Kind != tok::char_constant) { + if (Kind != tok::char_constant) + ++begin; + if (Kind == tok::utf8_char_constant) ++begin; - } // Skip over the entry quote. assert(begin[0] == '\'' && "Invalid token lexed"); @@ -1077,6 +1079,8 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end, if (tok::wide_char_constant == Kind) { largest_character_for_kind = 0xFFFFFFFFu >> (32-PP.getTargetInfo().getWCharWidth()); + } else if (tok::utf8_char_constant == Kind) { + largest_character_for_kind = 0x7F; } else if (tok::utf16_char_constant == Kind) { largest_character_for_kind = 0xFFFF; } else if (tok::utf32_char_constant == Kind) { diff --git a/clang/lib/Lex/MacroArgs.cpp b/clang/lib/Lex/MacroArgs.cpp index 0fa3239..9967f3f 100644 --- a/clang/lib/Lex/MacroArgs.cpp +++ b/clang/lib/Lex/MacroArgs.cpp @@ -218,6 +218,7 @@ Token MacroArgs::StringifyArgument(const Token *ArgToks, if (tok::isStringLiteral(Tok.getKind()) || // "foo", u8R"x(foo)x"_bar, etc. Tok.is(tok::char_constant) || // 'x' Tok.is(tok::wide_char_constant) || // L'x'. + Tok.is(tok::utf8_char_constant) || // u8'x'. Tok.is(tok::utf16_char_constant) || // u'x'. Tok.is(tok::utf32_char_constant)) { // U'x'. bool Invalid = false; diff --git a/clang/lib/Lex/PPExpressions.cpp b/clang/lib/Lex/PPExpressions.cpp index a3f5d93..9cf72cf 100644 --- a/clang/lib/Lex/PPExpressions.cpp +++ b/clang/lib/Lex/PPExpressions.cpp @@ -273,6 +273,7 @@ static bool EvaluateValue(PPValue &Result, Token &PeekTok, DefinedTracker &DT, } case tok::char_constant: // 'x' case tok::wide_char_constant: // L'x' + case tok::utf8_char_constant: // u8'x' case tok::utf16_char_constant: // u'x' case tok::utf32_char_constant: { // U'x' // Complain about, and drop, any ud-suffix. diff --git a/clang/lib/Lex/TokenConcatenation.cpp b/clang/lib/Lex/TokenConcatenation.cpp index 866cbb1..0832749 100644 --- a/clang/lib/Lex/TokenConcatenation.cpp +++ b/clang/lib/Lex/TokenConcatenation.cpp @@ -99,6 +99,10 @@ TokenConcatenation::TokenConcatenation(Preprocessor &pp) : PP(pp) { TokenInfo[tok::utf32_char_constant ] |= aci_custom; } + // These tokens have custom code in C++1z mode. + if (PP.getLangOpts().CPlusPlus1z) + TokenInfo[tok::utf8_char_constant] |= aci_custom; + // These tokens change behavior if followed by an '='. TokenInfo[tok::amp ] |= aci_avoid_equal; // &= TokenInfo[tok::plus ] |= aci_avoid_equal; // += @@ -213,6 +217,7 @@ bool TokenConcatenation::AvoidConcat(const Token &PrevPrevTok, case tok::utf32_string_literal: case tok::char_constant: case tok::wide_char_constant: + case tok::utf8_char_constant: case tok::utf16_char_constant: case tok::utf32_char_constant: if (!PP.getLangOpts().CPlusPlus11) @@ -236,7 +241,8 @@ bool TokenConcatenation::AvoidConcat(const Token &PrevPrevTok, if (Tok.getIdentifierInfo() || Tok.is(tok::wide_string_literal) || Tok.is(tok::utf8_string_literal) || Tok.is(tok::utf16_string_literal) || Tok.is(tok::utf32_string_literal) || Tok.is(tok::wide_char_constant) || - Tok.is(tok::utf16_char_constant) || Tok.is(tok::utf32_char_constant)) + Tok.is(tok::utf8_char_constant) || Tok.is(tok::utf16_char_constant) || + Tok.is(tok::utf32_char_constant)) return true; // If this isn't identifier + string, we're done. diff --git a/clang/lib/Parse/ParseExpr.cpp b/clang/lib/Parse/ParseExpr.cpp index 208ead8..6913de9 100644 --- a/clang/lib/Parse/ParseExpr.cpp +++ b/clang/lib/Parse/ParseExpr.cpp @@ -910,6 +910,7 @@ ExprResult Parser::ParseCastExpression(bool isUnaryExpression, } case tok::char_constant: // constant: character-constant case tok::wide_char_constant: + case tok::utf8_char_constant: case tok::utf16_char_constant: case tok::utf32_char_constant: Res = Actions.ActOnCharacterConstant(Tok, /*UDLScope*/getCurScope()); diff --git a/clang/lib/Parse/ParseTentative.cpp b/clang/lib/Parse/ParseTentative.cpp index 944e887..1f39c255 100644 --- a/clang/lib/Parse/ParseTentative.cpp +++ b/clang/lib/Parse/ParseTentative.cpp @@ -892,6 +892,7 @@ Parser::isExpressionOrTypeSpecifierSimple(tok::TokenKind Kind) { case tok::numeric_constant: case tok::char_constant: case tok::wide_char_constant: + case tok::utf8_char_constant: case tok::utf16_char_constant: case tok::utf32_char_constant: case tok::string_literal: diff --git a/clang/test/Lexer/utf8-char-literal.cpp b/clang/test/Lexer/utf8-char-literal.cpp index 7a4d126..0ddaabc 100644 --- a/clang/test/Lexer/utf8-char-literal.cpp +++ b/clang/test/Lexer/utf8-char-literal.cpp @@ -1,6 +1,15 @@ // RUN: %clang_cc1 -triple x86_64-apple-darwin -std=c++11 -fsyntax-only -verify %s // RUN: %clang_cc1 -triple x86_64-apple-darwin -std=c11 -x c -fsyntax-only -verify %s +// RUN: %clang_cc1 -triple x86_64-apple-darwin -std=c++1z -fsyntax-only -verify %s int array0[u'ñ' == u'\xf1'? 1 : -1]; int array1['\xF1' != u'\xf1'? 1 : -1]; int array1['ñ' != u'\xf1'? 1 : -1]; // expected-error {{character too large for enclosing character literal type}} +#if __cplusplus > 201402L +char a = u8'ñ'; // expected-error {{character too large for enclosing character literal type}} +char b = u8'\x80'; // ok +char c = u8'\u0080'; // expected-error {{character too large for enclosing character literal type}} +char d = u8'\u1234'; // expected-error {{character too large for enclosing character literal type}} +char e = u8'ሴ'; // expected-error {{character too large for enclosing character literal type}} +char f = u8'ab'; // expected-error {{Unicode character literals may not contain multiple characters}} +#endif diff --git a/clang/www/cxx_status.html b/clang/www/cxx_status.html index 2061714..345226b 100644 --- a/clang/www/cxx_status.html +++ b/clang/www/cxx_status.html @@ -549,12 +549,17 @@ as the draft C++1z standard evolves.

Fold expressions - N4295 + N4295 + SVN + + + u8 character literals + N4267 SVN Nested namespace definition - N4230 + N4230 SVN -- 2.7.4