From 31f4859c3e4d261d4a45118bb77d453138a6f7a9 Mon Sep 17 00:00:00 2001
From: Corentin Jabot <corentinjabot@gmail.com>
Date: Sun, 30 Oct 2022 23:20:00 +0100
Subject: [PATCH] [Clang] Allow additional mathematical symbols in identifiers.

Implement the proposed UAX Profile
"Mathematical notation profile for default identifiers".

This implements a not-yet approved Unicode for a vetted
UAX31 identifier profile
https://www.unicode.org/L2/L2022/22230-math-profile.pdf

This change mitigates the reported disruption caused
by the implementation of UAX31 in C++ and C2x,
as these mathematical symbols are commonly used in the
scientific community.

Fixes #54732

Reviewed By: tahonermann, #clang-language-wg

Differential Revision: https://reviews.llvm.org/D137051
---
 clang/docs/ReleaseNotes.rst                     |   5 ++
 clang/include/clang/Basic/DiagnosticLexKinds.td |   3 +
 clang/lib/Lex/Lexer.cpp                         | 111 ++++++++++++++++++------
 clang/lib/Lex/UnicodeCharSets.h                 |  30 +++++++
 clang/test/Driver/autocomplete.c                |   1 +
 clang/test/Lexer/unicode.c                      |  10 +++
 6 files changed, 133 insertions(+), 27 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index d9b44b6..09705a6 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -441,6 +441,11 @@ Non-comprehensive list of changes in this release
 - Unicode support has been updated to support Unicode 15.0.
   New unicode codepoints are supported as appropriate in diagnostics,
   C and C++ identifiers, and escape sequences.
+- In identifiers, Clang allows a restricted set of additional mathematical symbols
+  as an extension. These symbols correspond to a proposed Unicode
+  `Mathematical notation profile for default identifiers
+  <https://www.unicode.org/L2/L2022/22230-math-profile.pdf>`_.
+  This resolves `Issue 54732 <https://github.com/llvm/llvm-project/issues/54732>`_.
 - Clang now supports loading multiple configuration files. The files from
   default configuration paths are loaded first, unless ``--no-default-config``
   option is used. All files explicitly specified using ``--config=`` option
diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td
index a915f75..3b1b466 100644
--- a/clang/include/clang/Basic/DiagnosticLexKinds.td
+++ b/clang/include/clang/Basic/DiagnosticLexKinds.td
@@ -132,6 +132,9 @@ def warn_utf8_symbol_homoglyph : Warning<
 def warn_utf8_symbol_zero_width : Warning<
   "identifier contains Unicode character <U+%0> that is invisible in "
   "some environments">, InGroup<DiagGroup<"unicode-zero-width">>;
+def ext_mathematical_notation : ExtWarn<
+  "mathematical notation character <U+%0> in an identifier is a Clang extension">,
+  InGroup<DiagGroup<"mathematical-notation-identifier-extension">>;
 
 def ext_delimited_escape_sequence : Extension<
   "%select{delimited|named}0 escape sequences are a "
diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp
index c93d334..d1af455 100644
--- a/clang/lib/Lex/Lexer.cpp
+++ b/clang/lib/Lex/Lexer.cpp
@@ -1459,7 +1459,35 @@ static bool isUnicodeWhitespace(uint32_t Codepoint) {
   return UnicodeWhitespaceChars.contains(Codepoint);
 }
 
-static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts) {
+static llvm::SmallString<5> codepointAsHexString(uint32_t C) {
+  llvm::SmallString<5> CharBuf;
+  llvm::raw_svector_ostream CharOS(CharBuf);
+  llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);
+  return CharBuf;
+}
+
+// To mitigate https://github.com/llvm/llvm-project/issues/54732,
+// we allow "Mathematical Notation Characters" in identifiers.
+// This is a proposed profile that extends the XID_Start/XID_continue
+// with mathematical symbols, superscipts and subscripts digits
+// found in some production software.
+// https://www.unicode.org/L2/L2022/22230-math-profile.pdf
+static bool isMathematicalExtensionID(uint32_t C, const LangOptions &LangOpts,
+                                      bool IsStart, bool &IsExtension) {
+  static const llvm::sys::UnicodeCharSet MathStartChars(
+      MathematicalNotationProfileIDStartRanges);
+  static const llvm::sys::UnicodeCharSet MathContinueChars(
+      MathematicalNotationProfileIDContinueRanges);
+  if (MathStartChars.contains(C) ||
+      (!IsStart && MathContinueChars.contains(C))) {
+    IsExtension = true;
+    return true;
+  }
+  return false;
+}
+
+static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts,
+                            bool &IsExtension) {
   if (LangOpts.AsmPreprocessor) {
     return false;
   } else if (LangOpts.DollarIdents && '$' == C) {
@@ -1471,8 +1499,10 @@ static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts) {
     // '_' doesn't have the XID_Continue property but is allowed in C and C++.
     static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
     static const llvm::sys::UnicodeCharSet XIDContinueChars(XIDContinueRanges);
-    return C == '_' || XIDStartChars.contains(C) ||
-           XIDContinueChars.contains(C);
+    if (C == '_' || XIDStartChars.contains(C) || XIDContinueChars.contains(C))
+      return true;
+    return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/false,
+                                     IsExtension);
   } else if (LangOpts.C11) {
     static const llvm::sys::UnicodeCharSet C11AllowedIDChars(
         C11AllowedIDCharRanges);
@@ -1484,16 +1514,21 @@ static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts) {
   }
 }
 
-static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts) {
+static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts,
+                                     bool &IsExtension) {
   assert(C > 0x7F && "isAllowedInitiallyIDChar called with an ASCII codepoint");
+  IsExtension = false;
   if (LangOpts.AsmPreprocessor) {
     return false;
   }
   if (LangOpts.CPlusPlus || LangOpts.C2x) {
     static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
-    return XIDStartChars.contains(C);
+    if (XIDStartChars.contains(C))
+      return true;
+    return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/true,
+                                     IsExtension);
   }
-  if (!isAllowedIDChar(C, LangOpts))
+  if (!isAllowedIDChar(C, LangOpts, IsExtension))
     return false;
   if (LangOpts.C11) {
     static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars(
@@ -1505,6 +1540,20 @@ static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts) {
   return !C99DisallowedInitialIDChars.contains(C);
 }
 
+static void diagnoseExtensionInIdentifier(DiagnosticsEngine &Diags, uint32_t C,
+                                          CharSourceRange Range) {
+
+  static const llvm::sys::UnicodeCharSet MathStartChars(
+      MathematicalNotationProfileIDStartRanges);
+  static const llvm::sys::UnicodeCharSet MathContinueChars(
+      MathematicalNotationProfileIDContinueRanges);
+
+  assert((MathStartChars.contains(C) || MathContinueChars.contains(C)) &&
+         "Unexpected mathematical notation codepoint");
+  Diags.Report(Range.getBegin(), diag::ext_mathematical_notation)
+      << codepointAsHexString(C) << Range;
+}
+
 static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin,
                                             const char *End) {
   return CharSourceRange::getCharRange(L.getSourceLocation(Begin),
@@ -1604,18 +1653,13 @@ static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C,
       std::lower_bound(std::begin(SortedHomoglyphs),
                        std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'});
   if (Homoglyph->Character == C) {
-    llvm::SmallString<5> CharBuf;
-    {
-      llvm::raw_svector_ostream CharOS(CharBuf);
-      llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);
-    }
     if (Homoglyph->LooksLike) {
       const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};
       Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
-          << Range << CharBuf << LooksLikeStr;
+          << Range << codepointAsHexString(C) << LooksLikeStr;
     } else {
       Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width)
-          << Range << CharBuf;
+          << Range << codepointAsHexString(C);
     }
   }
 }
@@ -1626,25 +1670,24 @@ static void diagnoseInvalidUnicodeCodepointInIdentifier(
   if (isASCII(CodePoint))
     return;
 
-  bool IsIDStart = isAllowedInitiallyIDChar(CodePoint, LangOpts);
-  bool IsIDContinue = IsIDStart || isAllowedIDChar(CodePoint, LangOpts);
+  bool IsExtension;
+  bool IsIDStart = isAllowedInitiallyIDChar(CodePoint, LangOpts, IsExtension);
+  bool IsIDContinue =
+      IsIDStart || isAllowedIDChar(CodePoint, LangOpts, IsExtension);
 
   if ((IsFirst && IsIDStart) || (!IsFirst && IsIDContinue))
     return;
 
   bool InvalidOnlyAtStart = IsFirst && !IsIDStart && IsIDContinue;
 
-  llvm::SmallString<5> CharBuf;
-  llvm::raw_svector_ostream CharOS(CharBuf);
-  llvm::write_hex(CharOS, CodePoint, llvm::HexPrintStyle::Upper, 4);
-
   if (!IsFirst || InvalidOnlyAtStart) {
     Diags.Report(Range.getBegin(), diag::err_character_not_allowed_identifier)
-        << Range << CharBuf << int(InvalidOnlyAtStart)
+        << Range << codepointAsHexString(CodePoint) << int(InvalidOnlyAtStart)
         << FixItHint::CreateRemoval(Range);
   } else {
     Diags.Report(Range.getBegin(), diag::err_character_not_allowed)
-        << Range << CharBuf << FixItHint::CreateRemoval(Range);
+        << Range << codepointAsHexString(CodePoint)
+        << FixItHint::CreateRemoval(Range);
   }
 }
 
@@ -1655,8 +1698,8 @@ bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
   if (CodePoint == 0) {
     return false;
   }
-
-  if (!isAllowedIDChar(CodePoint, LangOpts)) {
+  bool IsExtension = false;
+  if (!isAllowedIDChar(CodePoint, LangOpts, IsExtension)) {
     if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint))
       return false;
     if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
@@ -1669,10 +1712,15 @@ bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
     // We got a unicode codepoint that is neither a space nor a
     // a valid identifier part.
     // Carry on as if the codepoint was valid for recovery purposes.
-  } else if (!isLexingRawMode())
+  } else if (!isLexingRawMode()) {
+    if (IsExtension)
+      diagnoseExtensionInIdentifier(PP->getDiagnostics(), CodePoint,
+                                    makeCharRange(*this, CurPtr, UCNPtr));
+
     maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
                               makeCharRange(*this, CurPtr, UCNPtr),
                               /*IsFirst=*/false);
+  }
 
   Result.setFlag(Token::HasUCN);
   if ((UCNPtr - CurPtr ==  6 && CurPtr[1] == 'u') ||
@@ -1695,7 +1743,9 @@ bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) {
   if (Result != llvm::conversionOK)
     return false;
 
-  if (!isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts)) {
+  bool IsExtension = false;
+  if (!isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts,
+                       IsExtension)) {
     if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint))
       return false;
 
@@ -1708,6 +1758,9 @@ bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) {
     // a valid identifier part. Carry on as if the codepoint was
     // valid for recovery purposes.
   } else if (!isLexingRawMode()) {
+    if (IsExtension)
+      diagnoseExtensionInIdentifier(PP->getDiagnostics(), CodePoint,
+                                    makeCharRange(*this, CurPtr, UnicodePtr));
     maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
                               makeCharRange(*this, CurPtr, UnicodePtr),
                               /*IsFirst=*/false);
@@ -1721,9 +1774,13 @@ bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) {
 
 bool Lexer::LexUnicodeIdentifierStart(Token &Result, uint32_t C,
                                       const char *CurPtr) {
-  if (isAllowedInitiallyIDChar(C, LangOpts)) {
+  bool IsExtension = false;
+  if (isAllowedInitiallyIDChar(C, LangOpts, IsExtension)) {
     if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
         !PP->isPreprocessedOutput()) {
+      if (IsExtension)
+        diagnoseExtensionInIdentifier(PP->getDiagnostics(), C,
+                                      makeCharRange(*this, BufferPtr, CurPtr));
       maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C,
                                 makeCharRange(*this, BufferPtr, CurPtr),
                                 /*IsFirst=*/true);
@@ -1737,7 +1794,7 @@ bool Lexer::LexUnicodeIdentifierStart(Token &Result, uint32_t C,
 
   if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
       !PP->isPreprocessedOutput() && !isASCII(*BufferPtr) &&
-      !isAllowedInitiallyIDChar(C, LangOpts) && !isUnicodeWhitespace(C)) {
+      !isUnicodeWhitespace(C)) {
     // Non-ASCII characters tend to creep into source code unintentionally.
     // Instead of letting the parser complain about the unknown token,
     // just drop the character.
diff --git a/clang/lib/Lex/UnicodeCharSets.h b/clang/lib/Lex/UnicodeCharSets.h
index f827217..5316d25 100644
--- a/clang/lib/Lex/UnicodeCharSets.h
+++ b/clang/lib/Lex/UnicodeCharSets.h
@@ -366,6 +366,36 @@ static const llvm::sys::UnicodeCharRange XIDContinueRanges[] = {
     {0x1E4EC, 0x1E4F9}, {0x1E8D0, 0x1E8D6}, {0x1E944, 0x1E94A},
     {0x1E950, 0x1E959}, {0x1FBF0, 0x1FBF9}, {0xE0100, 0xE01EF}};
 
+// Clang supports the "Mathematical notation profile" as an extension,
+// as described in https://www.unicode.org/L2/L2022/22230-math-profile.pdf
+// Math_Start
+static const llvm::sys::UnicodeCharRange
+    MathematicalNotationProfileIDStartRanges[] = {
+        {0x02202, 0x02202}, // â
+        {0x02207, 0x02207}, // â
+        {0x0221E, 0x0221E}, // â
+        {0x1D6C1, 0x1D6C1}, // ð
+        {0x1D6DB, 0x1D6DB}, // ð
+        {0x1D6FB, 0x1D6FB}, // ð»
+        {0x1D715, 0x1D715}, // ð
+        {0x1D735, 0x1D735}, // ðµ
+        {0x1D74F, 0x1D74F}, // ð
+        {0x1D76F, 0x1D76F}, // ð¯
+        {0x1D789, 0x1D789}, // ð
+        {0x1D7A9, 0x1D7A9}, // ð©
+        {0x1D7C3, 0x1D7C3}, // ð
+};
+
+// Math_Continue
+static const llvm::sys::UnicodeCharRange
+    MathematicalNotationProfileIDContinueRanges[] = {
+        {0x000B2, 0x000B3}, // Â²-Â³
+        {0x000B9, 0x000B9}, // Â¹
+        {0x02070, 0x02070}, // â°
+        {0x02074, 0x0207E}, // â´-â¾
+        {0x02080, 0x0208E}, // â-â
+};
+
 // C11 D.1, C++11 [charname.allowed]
 static const llvm::sys::UnicodeCharRange C11AllowedIDCharRanges[] = {
   // 1
diff --git a/clang/test/Driver/autocomplete.c b/clang/test/Driver/autocomplete.c
index 59055ef..502eee1 100644
--- a/clang/test/Driver/autocomplete.c
+++ b/clang/test/Driver/autocomplete.c
@@ -111,6 +111,7 @@
 // WARNING-NEXT: -Wmain-return-type
 // WARNING-NEXT: -Wmalformed-warning-check
 // WARNING-NEXT: -Wmany-braces-around-scalar-init
+// WARNING-NEXT: -Wmathematical-notation-identifier-extension
 // WARNING-NEXT: -Wmax-tokens
 // WARNING-NEXT: -Wmax-unsigned-zero
 // RUN: %clang --autocomplete=-Wno-invalid-pp- | FileCheck %s -check-prefix=NOWARNING
diff --git a/clang/test/Lexer/unicode.c b/clang/test/Lexer/unicode.c
index d79a6ed..d120d6c 100644
--- a/clang/test/Lexer/unicode.c
+++ b/clang/test/Lexer/unicode.c
@@ -46,7 +46,17 @@ extern int _\N{TANGSALETTERGA}; // expected-error {{'TANGSALETTERGA' is not a va
                                 // expected-error {{expected ';' after top level declarator}} \
                                 // expected-note {{characters names in Unicode escape sequences are sensitive to case and whitespace}}
 
+extern int ð; // expected-warning {{mathematical notation character <U+1D6DB> in an identifier is a Clang extension}}
+extern int â; // expected-error {{character <U+2089> not allowed at the start of an identifier}} \\
+                 expected-warning {{declaration does not declare anything}}
 
+int aÂ¹bâââââ; // expected-warning 6{{mathematical notation character}}
+
+int \u{221E} = 1; // expected-warning {{mathematical notation character}}
+int \N{MATHEMATICAL SANS-SERIF BOLD ITALIC PARTIAL DIFFERENTIAL} = 1;
+                 // expected-warning@-1 {{mathematical notation character}}
+
+int a\N{SUBSCRIPT EQUALS SIGN} = 1; // expected-warning {{mathematical notation character}}
 
 // This character doesn't have the XID_Start property
 extern int  \U00016AC0; // TANGSA DIGIT ZERO  // cxx-error {{expected unqualified-id}} \
-- 
2.7.4