From 31f4859c3e4d261d4a45118bb77d453138a6f7a9 Mon Sep 17 00:00:00 2001 From: Corentin Jabot Date: Sun, 30 Oct 2022 23:20:00 +0100 Subject: [PATCH] [Clang] Allow additional mathematical symbols in identifiers. Implement the proposed UAX Profile "Mathematical notation profile for default identifiers". This implements a not-yet-approved Unicode proposal for a vetted UAX31 identifier profile: https://www.unicode.org/L2/L2022/22230-math-profile.pdf This change mitigates the reported disruption caused by the implementation of UAX31 in C++ and C2x, as these mathematical symbols are commonly used in the scientific community. Fixes #54732 Reviewed By: tahonermann, #clang-language-wg Differential Revision: https://reviews.llvm.org/D137051 --- clang/docs/ReleaseNotes.rst | 5 ++ clang/include/clang/Basic/DiagnosticLexKinds.td | 3 + clang/lib/Lex/Lexer.cpp | 111 ++++++++++++++++++------ clang/lib/Lex/UnicodeCharSets.h | 30 +++++++ clang/test/Driver/autocomplete.c | 1 + clang/test/Lexer/unicode.c | 10 +++ 6 files changed, 133 insertions(+), 27 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index d9b44b6..09705a6 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -441,6 +441,11 @@ Non-comprehensive list of changes in this release - Unicode support has been updated to support Unicode 15.0. New unicode codepoints are supported as appropriate in diagnostics, C and C++ identifiers, and escape sequences. +- In identifiers, Clang allows a restricted set of additional mathematical symbols + as an extension. These symbols correspond to a proposed Unicode + `Mathematical notation profile for default identifiers + <https://www.unicode.org/L2/L2022/22230-math-profile.pdf>`_. + This resolves `Issue 54732 <https://github.com/llvm/llvm-project/issues/54732>`_. - Clang now supports loading multiple configuration files. The files from default configuration paths are loaded first, unless ``--no-default-config`` option is used. 
All files explicitly specified using ``--config=`` option diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td index a915f75..3b1b466 100644 --- a/clang/include/clang/Basic/DiagnosticLexKinds.td +++ b/clang/include/clang/Basic/DiagnosticLexKinds.td @@ -132,6 +132,9 @@ def warn_utf8_symbol_homoglyph : Warning< def warn_utf8_symbol_zero_width : Warning< "identifier contains Unicode character that is invisible in " "some environments">, InGroup<DiagGroup<"unicode-zero-width">>; +def ext_mathematical_notation : ExtWarn< + "mathematical notation character in an identifier is a Clang extension">, + InGroup<DiagGroup<"mathematical-notation-identifier-extension">>; def ext_delimited_escape_sequence : Extension< "%select{delimited|named}0 escape sequences are a " diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp index c93d334..d1af455 100644 --- a/clang/lib/Lex/Lexer.cpp +++ b/clang/lib/Lex/Lexer.cpp @@ -1459,7 +1459,35 @@ static bool isUnicodeWhitespace(uint32_t Codepoint) { return UnicodeWhitespaceChars.contains(Codepoint); } -static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts) { +static llvm::SmallString<5> codepointAsHexString(uint32_t C) { + llvm::SmallString<5> CharBuf; + llvm::raw_svector_ostream CharOS(CharBuf); + llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4); + return CharBuf; +} + +// To mitigate https://github.com/llvm/llvm-project/issues/54732, +// we allow "Mathematical Notation Characters" in identifiers. +// This is a proposed profile that extends XID_Start/XID_Continue +// with mathematical symbols and superscript and subscript digits +// found in some production software. 
+// https://www.unicode.org/L2/L2022/22230-math-profile.pdf +static bool isMathematicalExtensionID(uint32_t C, const LangOptions &LangOpts, + bool IsStart, bool &IsExtension) { + static const llvm::sys::UnicodeCharSet MathStartChars( + MathematicalNotationProfileIDStartRanges); + static const llvm::sys::UnicodeCharSet MathContinueChars( + MathematicalNotationProfileIDContinueRanges); + if (MathStartChars.contains(C) || + (!IsStart && MathContinueChars.contains(C))) { + IsExtension = true; + return true; + } + return false; +} + +static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts, + bool &IsExtension) { if (LangOpts.AsmPreprocessor) { return false; } else if (LangOpts.DollarIdents && '$' == C) { @@ -1471,8 +1499,10 @@ static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts) { // '_' doesn't have the XID_Continue property but is allowed in C and C++. static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges); static const llvm::sys::UnicodeCharSet XIDContinueChars(XIDContinueRanges); - return C == '_' || XIDStartChars.contains(C) || - XIDContinueChars.contains(C); + if (C == '_' || XIDStartChars.contains(C) || XIDContinueChars.contains(C)) + return true; + return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/false, + IsExtension); } else if (LangOpts.C11) { static const llvm::sys::UnicodeCharSet C11AllowedIDChars( C11AllowedIDCharRanges); @@ -1484,16 +1514,21 @@ static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts) { } } -static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts) { +static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts, + bool &IsExtension) { assert(C > 0x7F && "isAllowedInitiallyIDChar called with an ASCII codepoint"); + IsExtension = false; if (LangOpts.AsmPreprocessor) { return false; } if (LangOpts.CPlusPlus || LangOpts.C2x) { static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges); - return XIDStartChars.contains(C); + if 
(XIDStartChars.contains(C)) + return true; + return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/true, + IsExtension); } - if (!isAllowedIDChar(C, LangOpts)) + if (!isAllowedIDChar(C, LangOpts, IsExtension)) return false; if (LangOpts.C11) { static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars( @@ -1505,6 +1540,20 @@ static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts) { return !C99DisallowedInitialIDChars.contains(C); } +static void diagnoseExtensionInIdentifier(DiagnosticsEngine &Diags, uint32_t C, + CharSourceRange Range) { + + static const llvm::sys::UnicodeCharSet MathStartChars( + MathematicalNotationProfileIDStartRanges); + static const llvm::sys::UnicodeCharSet MathContinueChars( + MathematicalNotationProfileIDContinueRanges); + + assert((MathStartChars.contains(C) || MathContinueChars.contains(C)) && + "Unexpected mathematical notation codepoint"); + Diags.Report(Range.getBegin(), diag::ext_mathematical_notation) + << codepointAsHexString(C) << Range; +} + static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin, const char *End) { return CharSourceRange::getCharRange(L.getSourceLocation(Begin), @@ -1604,18 +1653,13 @@ static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C, std::lower_bound(std::begin(SortedHomoglyphs), std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'}); if (Homoglyph->Character == C) { - llvm::SmallString<5> CharBuf; - { - llvm::raw_svector_ostream CharOS(CharBuf); - llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4); - } if (Homoglyph->LooksLike) { const char LooksLikeStr[] = {Homoglyph->LooksLike, 0}; Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph) - << Range << CharBuf << LooksLikeStr; + << Range << codepointAsHexString(C) << LooksLikeStr; } else { Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width) - << Range << CharBuf; + << Range << codepointAsHexString(C); } } } @@ -1626,25 +1670,24 @@ static void 
diagnoseInvalidUnicodeCodepointInIdentifier( if (isASCII(CodePoint)) return; - bool IsIDStart = isAllowedInitiallyIDChar(CodePoint, LangOpts); - bool IsIDContinue = IsIDStart || isAllowedIDChar(CodePoint, LangOpts); + bool IsExtension; + bool IsIDStart = isAllowedInitiallyIDChar(CodePoint, LangOpts, IsExtension); + bool IsIDContinue = + IsIDStart || isAllowedIDChar(CodePoint, LangOpts, IsExtension); if ((IsFirst && IsIDStart) || (!IsFirst && IsIDContinue)) return; bool InvalidOnlyAtStart = IsFirst && !IsIDStart && IsIDContinue; - llvm::SmallString<5> CharBuf; - llvm::raw_svector_ostream CharOS(CharBuf); - llvm::write_hex(CharOS, CodePoint, llvm::HexPrintStyle::Upper, 4); - if (!IsFirst || InvalidOnlyAtStart) { Diags.Report(Range.getBegin(), diag::err_character_not_allowed_identifier) - << Range << CharBuf << int(InvalidOnlyAtStart) + << Range << codepointAsHexString(CodePoint) << int(InvalidOnlyAtStart) << FixItHint::CreateRemoval(Range); } else { Diags.Report(Range.getBegin(), diag::err_character_not_allowed) - << Range << CharBuf << FixItHint::CreateRemoval(Range); + << Range << codepointAsHexString(CodePoint) + << FixItHint::CreateRemoval(Range); } } @@ -1655,8 +1698,8 @@ bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size, if (CodePoint == 0) { return false; } - - if (!isAllowedIDChar(CodePoint, LangOpts)) { + bool IsExtension = false; + if (!isAllowedIDChar(CodePoint, LangOpts, IsExtension)) { if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint)) return false; if (!isLexingRawMode() && !ParsingPreprocessorDirective && @@ -1669,10 +1712,15 @@ bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size, // We got a unicode codepoint that is neither a space nor a // a valid identifier part. // Carry on as if the codepoint was valid for recovery purposes. 
- } else if (!isLexingRawMode()) + } else if (!isLexingRawMode()) { + if (IsExtension) + diagnoseExtensionInIdentifier(PP->getDiagnostics(), CodePoint, + makeCharRange(*this, CurPtr, UCNPtr)); + maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint, makeCharRange(*this, CurPtr, UCNPtr), /*IsFirst=*/false); + } Result.setFlag(Token::HasUCN); if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') || @@ -1695,7 +1743,9 @@ bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) { if (Result != llvm::conversionOK) return false; - if (!isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts)) { + bool IsExtension = false; + if (!isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts, + IsExtension)) { if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint)) return false; @@ -1708,6 +1758,9 @@ bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) { // a valid identifier part. Carry on as if the codepoint was // valid for recovery purposes. } else if (!isLexingRawMode()) { + if (IsExtension) + diagnoseExtensionInIdentifier(PP->getDiagnostics(), CodePoint, + makeCharRange(*this, CurPtr, UnicodePtr)); maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint, makeCharRange(*this, CurPtr, UnicodePtr), /*IsFirst=*/false); @@ -1721,9 +1774,13 @@ bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) { bool Lexer::LexUnicodeIdentifierStart(Token &Result, uint32_t C, const char *CurPtr) { - if (isAllowedInitiallyIDChar(C, LangOpts)) { + bool IsExtension = false; + if (isAllowedInitiallyIDChar(C, LangOpts, IsExtension)) { if (!isLexingRawMode() && !ParsingPreprocessorDirective && !PP->isPreprocessedOutput()) { + if (IsExtension) + diagnoseExtensionInIdentifier(PP->getDiagnostics(), C, + makeCharRange(*this, BufferPtr, CurPtr)); maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C, makeCharRange(*this, BufferPtr, CurPtr), /*IsFirst=*/true); @@ -1737,7 +1794,7 @@ bool Lexer::LexUnicodeIdentifierStart(Token &Result, uint32_t C, if (!isLexingRawMode() && 
!ParsingPreprocessorDirective && !PP->isPreprocessedOutput() && !isASCII(*BufferPtr) && - !isAllowedInitiallyIDChar(C, LangOpts) && !isUnicodeWhitespace(C)) { + !isUnicodeWhitespace(C)) { // Non-ASCII characters tend to creep into source code unintentionally. // Instead of letting the parser complain about the unknown token, // just drop the character. diff --git a/clang/lib/Lex/UnicodeCharSets.h b/clang/lib/Lex/UnicodeCharSets.h index f827217..5316d25 100644 --- a/clang/lib/Lex/UnicodeCharSets.h +++ b/clang/lib/Lex/UnicodeCharSets.h @@ -366,6 +366,36 @@ static const llvm::sys::UnicodeCharRange XIDContinueRanges[] = { {0x1E4EC, 0x1E4F9}, {0x1E8D0, 0x1E8D6}, {0x1E944, 0x1E94A}, {0x1E950, 0x1E959}, {0x1FBF0, 0x1FBF9}, {0xE0100, 0xE01EF}}; +// Clang supports the "Mathematical notation profile" as an extension, +// as described in https://www.unicode.org/L2/L2022/22230-math-profile.pdf +// Math_Start +static const llvm::sys::UnicodeCharRange + MathematicalNotationProfileIDStartRanges[] = { + {0x02202, 0x02202}, // ∂ + {0x02207, 0x02207}, // ∇ + {0x0221E, 0x0221E}, // ∞ + {0x1D6C1, 0x1D6C1}, // 𝛁 + {0x1D6DB, 0x1D6DB}, // 𝛛 + {0x1D6FB, 0x1D6FB}, // 𝛻 + {0x1D715, 0x1D715}, // 𝜕 + {0x1D735, 0x1D735}, // 𝜵 + {0x1D74F, 0x1D74F}, // 𝝏 + {0x1D76F, 0x1D76F}, // 𝝯 + {0x1D789, 0x1D789}, // 𝞉 + {0x1D7A9, 0x1D7A9}, // 𝞩 + {0x1D7C3, 0x1D7C3}, // 𝟃 +}; + +// Math_Continue +static const llvm::sys::UnicodeCharRange + MathematicalNotationProfileIDContinueRanges[] = { + {0x000B2, 0x000B3}, // ²-³ + {0x000B9, 0x000B9}, // ¹ + {0x02070, 0x02070}, // ⁰ + {0x02074, 0x0207E}, // ⁴-⁾ + {0x02080, 0x0208E}, // ₀-₎ +}; + // C11 D.1, C++11 [charname.allowed] static const llvm::sys::UnicodeCharRange C11AllowedIDCharRanges[] = { // 1 diff --git a/clang/test/Driver/autocomplete.c b/clang/test/Driver/autocomplete.c index 59055ef..502eee1 100644 --- a/clang/test/Driver/autocomplete.c +++ b/clang/test/Driver/autocomplete.c @@ -111,6 +111,7 @@ // WARNING-NEXT: -Wmain-return-type // WARNING-NEXT: 
-Wmalformed-warning-check // WARNING-NEXT: -Wmany-braces-around-scalar-init +// WARNING-NEXT: -Wmathematical-notation-identifier-extension // WARNING-NEXT: -Wmax-tokens // WARNING-NEXT: -Wmax-unsigned-zero // RUN: %clang --autocomplete=-Wno-invalid-pp- | FileCheck %s -check-prefix=NOWARNING diff --git a/clang/test/Lexer/unicode.c b/clang/test/Lexer/unicode.c index d79a6ed..d120d6c 100644 --- a/clang/test/Lexer/unicode.c +++ b/clang/test/Lexer/unicode.c @@ -46,7 +46,17 @@ extern int _\N{TANGSALETTERGA}; // expected-error {{'TANGSALETTERGA' is not a va // expected-error {{expected ';' after top level declarator}} \ // expected-note {{characters names in Unicode escape sequences are sensitive to case and whitespace}} +extern int 𝛛; // expected-warning {{mathematical notation character in an identifier is a Clang extension}} +extern int ₉; // expected-error {{character not allowed at the start of an identifier}} \\ + expected-warning {{declaration does not declare anything}} +int a¹b₍₄₂₎∇; // expected-warning 6{{mathematical notation character}} + +int \u{221E} = 1; // expected-warning {{mathematical notation character}} +int \N{MATHEMATICAL SANS-SERIF BOLD ITALIC PARTIAL DIFFERENTIAL} = 1; + // expected-warning@-1 {{mathematical notation character}} + +int a\N{SUBSCRIPT EQUALS SIGN} = 1; // expected-warning {{mathematical notation character}} // This character doesn't have the XID_Start property extern int \U00016AC0; // TANGSA DIGIT ZERO // cxx-error {{expected unqualified-id}} \ -- 2.7.4