From 33ec65305525626d5d93bd794c1c9cfa55d0ca8f Mon Sep 17 00:00:00 2001 From: =?utf8?q?Timm=20B=C3=A4der?= Date: Tue, 8 Feb 2022 10:13:11 +0100 Subject: [PATCH] [clang][lexer] Allow u8 character literal prefixes in C2x Implement N2418 for C2x. Differential Revision: https://reviews.llvm.org/D119221 --- clang/docs/ReleaseNotes.rst | 1 + clang/lib/Lex/Lexer.cpp | 9 ++++++--- clang/lib/Sema/SemaExpr.cpp | 5 ++++- clang/test/Lexer/utf8-char-literal.cpp | 13 +++++++++++++ clang/www/c_status.html | 2 +- 5 files changed, 25 insertions(+), 5 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index fd4eac5..c396322 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -226,6 +226,7 @@ C2x Feature Support - Implemented `WG14 N2775 Literal suffixes for bit-precise integers <https://www.open-std.org/jtc1/sc22/wg14/www/docs/n2775.pdf>`_. - Implemented the `*_WIDTH` macros to complete support for `WG14 N2412 Two's complement sign representation for C2x <https://www.open-std.org/jtc1/sc22/wg14/www/docs/n2412.pdf>`_. +- Implemented `WG14 N2418 Adding the u8 character prefix <https://www.open-std.org/jtc1/sc22/wg14/www/docs/n2418.pdf>`_. C++ Language Changes in Clang ----------------------------- diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp index 6e8072f..6fc78cf 100644 --- a/clang/lib/Lex/Lexer.cpp +++ b/clang/lib/Lex/Lexer.cpp @@ -3459,7 +3459,10 @@ LexNextToken: MIOpt.ReadToken(); return LexNumericConstant(Result, CurPtr); - case 'u': // Identifier (uber) or C11/C++11 UTF-8 or UTF-16 string literal + // Identifier (e.g., uber), or + // UTF-8 (C2x/C++17) or UTF-16 (C11/C++11) character literal, or + // UTF-8 or UTF-16 string literal (C11/C++11). + case 'u': // Notify MIOpt that we read a non-whitespace/non-comment token. 
MIOpt.ReadToken(); @@ -3493,7 +3496,7 @@ LexNextToken: ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), SizeTmp2, Result), tok::utf8_string_literal); - if (Char2 == '\'' && LangOpts.CPlusPlus17) + if (Char2 == '\'' && (LangOpts.CPlusPlus17 || LangOpts.C2x)) return LexCharConstant( Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), SizeTmp2, Result), @@ -3517,7 +3520,7 @@ LexNextToken: // treat u like the start of an identifier. return LexIdentifierContinue(Result, CurPtr); - case 'U': // Identifier (Uber) or C11/C++11 UTF-32 string literal + case 'U': // Identifier (e.g. Uber) or C11/C++11 UTF-32 string literal // Notify MIOpt that we read a non-whitespace/non-comment token. MIOpt.ReadToken(); diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 3e84961..bb0bde6 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -3609,6 +3609,8 @@ ExprResult Sema::ActOnCharacterConstant(const Token &Tok, Scope *UDLScope) { QualType Ty; if (Literal.isWide()) Ty = Context.WideCharTy; // L'x' -> wchar_t in C and C++. + else if (Literal.isUTF8() && getLangOpts().C2x) + Ty = Context.UnsignedCharTy; // u8'x' -> unsigned char in C2x else if (Literal.isUTF8() && getLangOpts().Char8) Ty = Context.Char8Ty; // u8'x' -> char8_t when it exists. else if (Literal.isUTF16()) @@ -3618,7 +3620,8 @@ ExprResult Sema::ActOnCharacterConstant(const Token &Tok, Scope *UDLScope) { else if (!getLangOpts().CPlusPlus || Literal.isMultiChar()) Ty = Context.IntTy; // 'x' -> int in C, 'wxyz' -> int in C++. else - Ty = Context.CharTy; // 'x' -> char in C++ + Ty = Context.CharTy; // 'x' -> char in C++; + // u8'x' -> char in C11-C17 and in C++ without char8_t. 
CharacterLiteral::CharacterKind Kind = CharacterLiteral::Ascii; if (Literal.isWide()) diff --git a/clang/test/Lexer/utf8-char-literal.cpp b/clang/test/Lexer/utf8-char-literal.cpp index 0ddaabc..ababc8b 100644 --- a/clang/test/Lexer/utf8-char-literal.cpp +++ b/clang/test/Lexer/utf8-char-literal.cpp @@ -1,5 +1,6 @@ // RUN: %clang_cc1 -triple x86_64-apple-darwin -std=c++11 -fsyntax-only -verify %s // RUN: %clang_cc1 -triple x86_64-apple-darwin -std=c11 -x c -fsyntax-only -verify %s +// RUN: %clang_cc1 -triple x86_64-apple-darwin -std=c2x -x c -fsyntax-only -verify %s // RUN: %clang_cc1 -triple x86_64-apple-darwin -std=c++1z -fsyntax-only -verify %s int array0[u'ñ' == u'\xf1'? 1 : -1]; @@ -12,4 +13,16 @@ char c = u8'\u0080'; // expected-error {{character too large for enclosing chara char d = u8'\u1234'; // expected-error {{character too large for enclosing character literal type}} char e = u8'ሴ'; // expected-error {{character too large for enclosing character literal type}} char f = u8'ab'; // expected-error {{Unicode character literals may not contain multiple characters}} +#elif __STDC_VERSION__ > 202000L +char a = u8'ñ'; // expected-error {{character too large for enclosing character literal type}} +char b = u8'\x80'; // ok +char c = u8'\u0080'; // expected-error {{universal character name refers to a control character}} +char d = u8'\u1234'; // expected-error {{character too large for enclosing character literal type}} +char e = u8'ሴ'; // expected-error {{character too large for enclosing character literal type}} +char f = u8'ab'; // expected-error {{Unicode character literals may not contain multiple characters}} +_Static_assert( + _Generic(u8'a', + default : 0, + unsigned char : 1), + "Surprise!"); #endif diff --git a/clang/www/c_status.html b/clang/www/c_status.html index 3a00648..b827d8d 100644 --- a/clang/www/c_status.html +++ b/clang/www/c_status.html @@ -720,7 +720,7 @@ conformance.

Adding the u8 character prefix N2418 - No + Clang 15 Remove support for function definitions with identifier lists -- 2.7.4