[OSX] HybridGlobalization Workaround for insufficiently sized destination buffer...

author Meri Khamoyan <96171496+mkhamoyan@users.noreply.github.com>

Tue, 4 Jul 2023 06:20:43 +0000 (10:20 +0400)

committer GitHub <noreply@github.com>

Tue, 4 Jul 2023 06:20:43 +0000 (10:20 +0400)
author Meri Khamoyan <96171496+mkhamoyan@users.noreply.github.com>
Tue, 4 Jul 2023 06:20:43 +0000 (10:20 +0400)
committer GitHub <noreply@github.com>
Tue, 4 Jul 2023 06:20:43 +0000 (10:20 +0400)
diff --git a/docs/design/features/globalization-hybrid-mode.md b/docs/design/features/globalization-hybrid-mode.md

index 920f808..7f7b3bb 100644 (file)
--- a/docs/design/features/globalization-hybrid-mode.md
+++ b/docs/design/features/globalization-hybrid-mode.md
@@ -423,19 +423,3 @@ Below function are used from apple native functions:
  - [uppercaseStringWithLocale](https://developer.apple.com/documentation/foundation/nsstring/1413316-uppercasestringwithlocale?language=objc)
  - [lowercaseStringWithLocale](https://developer.apple.com/documentation/foundation/nsstring/1417298-lowercasestringwithlocale?language=objc)
  
-Behavioural changes compared to ICU
-
-   - Final sigma behavior correction:
-
-     ICU-based case change does not respect final-sigma rule, but hybrid does, so "ΒΌΛΟΣ" -> "βόλος", not "βόλοσ".
-
-   - Below cases will throw exception because of insufficiently sized destination buffer
-
-      - Capitalizing the German letter ß (sharp S) gives SS when using Apple native functions.
-
-      - Capitalizing ligatures gives different result on Apple platforms, eg. "\uFB00" (ﬀ) uppercase (FF)
-
-      - Capitalizing "\u0149" (ŉ) on Apple platforms returns combination of  "\u02BC" (ʼ) and N -> (ʼN)
-
-
-
diff --git a/src/libraries/System.Globalization/tests/System/Globalization/TextInfoTests.cs b/src/libraries/System.Globalization/tests/System/Globalization/TextInfoTests.cs

index 6426a2a..466be98 100644 (file)
--- a/src/libraries/System.Globalization/tests/System/Globalization/TextInfoTests.cs
+++ b/src/libraries/System.Globalization/tests/System/Globalization/TextInfoTests.cs
@@ -274,9 +274,9 @@ namespace System.Globalization.Tests
                  // we also don't preform.
                  // Greek Capital Letter Sigma (does not case to U+03C2 with "final sigma" rule).
                  yield return new object[] { cultureName, "\u03A3", "\u03C3" };
-                if (PlatformDetection.IsHybridGlobalizationOnBrowser || PlatformDetection.IsHybridGlobalizationOnOSX)
+                if (PlatformDetection.IsHybridGlobalizationOnBrowser)
                  {
-                    // JS and Apple platforms are using "final sigma" rule correctly - it's costly to unify it with ICU's behavior
+                    // JS is using "final sigma" rule correctly - it's costly to unify it with ICU's behavior
                      yield return new object[] { cultureName, "O\u03A3", "o\u03C2" };
                  }
                  else
@@ -396,29 +396,24 @@ namespace System.Globalization.Tests
                  // RAINBOW (outside the BMP and does not case)
                  yield return new object[] { cultureName, "\U0001F308", "\U0001F308" };
  
-                if (!PlatformDetection.IsHybridGlobalizationOnOSX)
-                {
-                    // Unicode defines some codepoints which expand into multiple codepoints
-                    // when cased (see SpecialCasing.txt from UNIDATA for some examples). We have never done
-                    // these sorts of expansions, since it would cause string lengths to change when cased,
-                    // which is non-intuitive. In addition, there are some context sensitive mappings which
-                    // we also don't preform.
-                    // es-zed does not case to SS when uppercased.
-                    // on OSX, capitalizing the German letter ß (sharp S) gives SS
-                    yield return new object[] { cultureName, "\u00DF", "\u00DF" };
-                    yield return new object[] { cultureName, "stra\u00DFe", "STRA\u00DFE" };
-                    if (!PlatformDetection.IsNlsGlobalization)
-                        yield return new object[] { cultureName, "st\uD801\uDC37ra\u00DFe", "ST\uD801\uDC0FRA\u00DFE" };
-
-                    // Ligatures do not expand when cased.
-                    // on OSX, this is uppercase to "FF"
-                    yield return new object[] { cultureName, "\uFB00", "\uFB00" };
-
-                    // Precomposed character with no uppercase variant, we don't want to "decompose" this
-                    // as part of casing.
-                    // on OSX, this is uppercased to "ʼN"
-                    yield return new object[] { cultureName, "\u0149", "\u0149" };
-                }
+                
+                // Unicode defines some codepoints which expand into multiple codepoints
+                // when cased (see SpecialCasing.txt from UNIDATA for some examples). We have never done
+                // these sorts of expansions, since it would cause string lengths to change when cased,
+                // which is non-intuitive. In addition, there are some context sensitive mappings which
+                // we also don't perform.
+                // es-zed does not case to SS when uppercased.
+                yield return new object[] { cultureName, "\u00DF", "\u00DF" };
+                yield return new object[] { cultureName, "stra\u00DFe", "STRA\u00DFE" };
+                if (!PlatformDetection.IsNlsGlobalization)
+                    yield return new object[] { cultureName, "st\uD801\uDC37ra\u00DFe", "ST\uD801\uDC0FRA\u00DFE" };
+
+                // Ligatures do not expand when cased.
+                yield return new object[] { cultureName, "\uFB00", "\uFB00" };
+
+                // Precomposed character with no uppercase variant, we don't want to "decompose" this
+                // as part of casing.
+                yield return new object[] { cultureName, "\u0149", "\u0149" };
              }
  
              // Turkish i
diff --git a/src/native/libs/System.Globalization.Native/pal_casing.m b/src/native/libs/System.Globalization.Native/pal_casing.m

index 7aeacb5..e0bd0f8 100644 (file)
--- a/src/native/libs/System.Globalization.Native/pal_casing.m
+++ b/src/native/libs/System.Globalization.Native/pal_casing.m
@@ -9,6 +9,47 @@
  
  #if defined(TARGET_OSX) || defined(TARGET_MACCATALYST) || defined(TARGET_IOS) || defined(TARGET_TVOS)
  
+/**
+ * Is this code unit a lead surrogate (U+d800..U+dbff)?
+ * @param c 16-bit code unit
+ * @return true or false
+ */
+#define IS_LEAD(c) (((c)&0xfffffc00) == 0xd800)
+
+/**
+ * Is this code unit a trail surrogate (U+dc00..U+dfff)?
+ * @param c 16-bit code unit
+ * @return true or false
+ */
+#define IS_TRAIL(c) (((c)&0xfffffc00) == 0xdc00)
+
+/**
+ * Advance a string offset from one code point boundary to the next,
+ * without returning the code point itself.
+ * (Post-incrementing forward iteration.)
+ * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
+ *
+ * The length can be negative for a NUL-terminated string.
+ *
+ * If the offset points to the lead surrogate unit of a supplementary
+ * code point, the offset is advanced past the following trail surrogate
+ * as well, so that both code units are cased together.
+ * If the offset points to a trail surrogate or to a single, unpaired lead
+ * surrogate, the offset is advanced past that one unpaired surrogate only.
+ *
+ * @param s const uint16_t* string
+ * @param i in/out string offset, must be i<length on entry
+ * @param length string length
+ */
+#define NEXTOFFSET(s, i, length) { \
+    uint16_t c = (s)[(i)++]; \
+    if (IS_LEAD(c)) { \
+        uint16_t __c2; \
+        if ((i) != (length) && IS_TRAIL(__c2 = (s)[(i)])) { \
+            ++(i); \
+        } \
+    } \
+}
  
  /**
   * Append a code point to a string, overwriting 1 or 2 code units.
@@ -46,6 +87,11 @@ Function:
  ChangeCaseNative
  
  Performs upper or lower casing of a string into a new buffer, taking into account the specified locale.
+Two things we are considering here:
+1. Prohibiting code point expansions. Some code points expand into multiple code points when uppercased or lowercased, which may lead to an insufficient destination buffer.
+   Instead, we prohibit these expansions and iterate through the string character by character, opting for the original character if it would have been expanded.
+2. Properly handling surrogate pairs. A character can be comprised of more than one UTF-16 code unit
+   (e.g. surrogate pairs like \uD801\uDC37). All code units of a character are needed to properly change its case.
  Returns 0 for success, non-zero on failure see ErrorCodes.
  */
  int32_t GlobalizationNative_ChangeCaseNative(const uint16_t* localeName, int32_t lNameLength,
@@ -61,15 +107,25 @@ int32_t GlobalizationNative_ChangeCaseNative(const uint16_t* localeName, int32_t
          NSString *locName = [NSString stringWithCharacters: localeName length: lNameLength];
          currentLocale = [NSLocale localeWithLocaleIdentifier:locName];
      }
-    NSString *source = [NSString stringWithCharacters: lpSrc length: cwSrcLength];
-    NSString *result = bToUpper ? [source uppercaseStringWithLocale:currentLocale] : [source lowercaseStringWithLocale:currentLocale];
  
      int32_t srcIdx = 0, dstIdx = 0, isError = 0;
      uint16_t dstCodepoint;
-    while (srcIdx < result.length)
+    while (srcIdx < cwSrcLength)
      {
-        dstCodepoint = [result characterAtIndex:srcIdx++];
-        Append(lpDst, dstIdx, cwDstLength, dstCodepoint, isError);
+        int32_t startIndex = srcIdx;
+        NEXTOFFSET(lpSrc, srcIdx, cwSrcLength);
+        int32_t srcLength = srcIdx - startIndex;
+        NSString *src = [NSString stringWithCharacters: lpSrc + startIndex length: srcLength];
+        NSString *dst = bToUpper ? [src uppercaseStringWithLocale:currentLocale] : [src lowercaseStringWithLocale:currentLocale];
+        int32_t index = 0;
+        // iterate over all code units of the current character (two for a surrogate pair)
+        while (index < srcLength)
+        {
+            // if casing expanded the character (dst.length > srcLength), keep the original code units
+            dstCodepoint = dst.length > srcLength ? [src characterAtIndex: index] : [dst characterAtIndex: index];
+            Append(lpDst, dstIdx, cwDstLength, dstCodepoint, isError);
+            index++;
+        }
          if (isError)
              return isError;
      }
@@ -81,19 +137,33 @@ Function:
  ChangeCaseInvariantNative
  
  Performs upper or lower casing of a string into a new buffer.
+Two things we are considering here:
+1. Prohibiting code point expansions. Some code points expand into multiple code points when uppercased or lowercased, which may lead to an insufficient destination buffer.
+   Instead, we prohibit these expansions and iterate through the string character by character, opting for the original character if it would have been expanded.
+2. Properly handling surrogate pairs. A character can be comprised of more than one UTF-16 code unit
+   (e.g. surrogate pairs like \uD801\uDC37). All code units of a character are needed to properly change its case.
  Returns 0 for success, non-zero on failure see ErrorCodes.
  */
  int32_t GlobalizationNative_ChangeCaseInvariantNative(const uint16_t* lpSrc, int32_t cwSrcLength, uint16_t* lpDst, int32_t cwDstLength, int32_t bToUpper)
  {
-    NSString *source = [NSString stringWithCharacters: lpSrc length: cwSrcLength];
-    NSString *result = bToUpper ? source.uppercaseString : source.lowercaseString;
-
      int32_t srcIdx = 0, dstIdx = 0, isError = 0;
      uint16_t dstCodepoint;
-    while (srcIdx < result.length)
+    while (srcIdx < cwSrcLength)
      {
-        dstCodepoint = [result characterAtIndex:srcIdx++];
-        Append(lpDst, dstIdx, cwDstLength, dstCodepoint, isError);
+        int32_t startIndex = srcIdx;
+        NEXTOFFSET(lpSrc, srcIdx, cwSrcLength);
+        int32_t srcLength = srcIdx - startIndex;
+        NSString *src = [NSString stringWithCharacters: lpSrc + startIndex length: srcLength];
+        NSString *dst = bToUpper ? src.uppercaseString : src.lowercaseString;
+        int32_t index = 0;
+        // iterate over all code units of the current character (two for a surrogate pair)
+        while (index < srcLength)
+        {
+            // if casing expanded the character (dst.length > srcLength), keep the original code units
+            dstCodepoint = dst.length > srcLength ? [src characterAtIndex: index] : [dst characterAtIndex: index];
+            Append(lpDst, dstIdx, cwDstLength, dstCodepoint, isError);
+            index++;
+        }
          if (isError)
              return isError;
      }
author	Meri Khamoyan <96171496+mkhamoyan@users.noreply.github.com>
	Tue, 4 Jul 2023 06:20:43 +0000 (10:20 +0400)
committer	GitHub <noreply@github.com>
	Tue, 4 Jul 2023 06:20:43 +0000 (10:20 +0400)
docs/design/features/globalization-hybrid-mode.md		patch \| blob \| history
src/libraries/System.Globalization/tests/System/Globalization/TextInfoTests.cs		patch \| blob \| history
src/native/libs/System.Globalization.Native/pal_casing.m		patch \| blob \| history