// we also don't preform.
// Greek Capital Letter Sigma (does not case to U+03C2 with "final sigma" rule).
yield return new object[] { cultureName, "\u03A3", "\u03C3" };
- if (PlatformDetection.IsHybridGlobalizationOnBrowser || PlatformDetection.IsHybridGlobalizationOnOSX)
+ if (PlatformDetection.IsHybridGlobalizationOnBrowser)
{
- // JS and Apple platforms are using "final sigma" rule correctly - it's costly to unify it with ICU's behavior
+ // JS is using "final sigma" rule correctly - it's costly to unify it with ICU's behavior
yield return new object[] { cultureName, "O\u03A3", "o\u03C2" };
}
else
// RAINBOW (outside the BMP and does not case)
yield return new object[] { cultureName, "\U0001F308", "\U0001F308" };
- if (!PlatformDetection.IsHybridGlobalizationOnOSX)
- {
- // Unicode defines some codepoints which expand into multiple codepoints
- // when cased (see SpecialCasing.txt from UNIDATA for some examples). We have never done
- // these sorts of expansions, since it would cause string lengths to change when cased,
- // which is non-intuitive. In addition, there are some context sensitive mappings which
- // we also don't preform.
- // es-zed does not case to SS when uppercased.
- // on OSX, capitalizing the German letter ß (sharp S) gives SS
- yield return new object[] { cultureName, "\u00DF", "\u00DF" };
- yield return new object[] { cultureName, "stra\u00DFe", "STRA\u00DFE" };
- if (!PlatformDetection.IsNlsGlobalization)
- yield return new object[] { cultureName, "st\uD801\uDC37ra\u00DFe", "ST\uD801\uDC0FRA\u00DFE" };
-
- // Ligatures do not expand when cased.
- // on OSX, this is uppercase to "FF"
- yield return new object[] { cultureName, "\uFB00", "\uFB00" };
-
- // Precomposed character with no uppercase variant, we don't want to "decompose" this
- // as part of casing.
- // on OSX, this is uppercased to "ʼN"
- yield return new object[] { cultureName, "\u0149", "\u0149" };
- }
+
+ // Unicode defines some codepoints which expand into multiple codepoints
+ // when cased (see SpecialCasing.txt from UNIDATA for some examples). We have never done
+ // these sorts of expansions, since it would cause string lengths to change when cased,
+ // which is non-intuitive. In addition, there are some context sensitive mappings which
+ // we also don't perform.
+ // es-zed does not case to SS when uppercased.
+ yield return new object[] { cultureName, "\u00DF", "\u00DF" };
+ yield return new object[] { cultureName, "stra\u00DFe", "STRA\u00DFE" };
+ if (!PlatformDetection.IsNlsGlobalization)
+ yield return new object[] { cultureName, "st\uD801\uDC37ra\u00DFe", "ST\uD801\uDC0FRA\u00DFE" };
+
+ // Ligatures do not expand when cased.
+ yield return new object[] { cultureName, "\uFB00", "\uFB00" };
+
+ // Precomposed character with no uppercase variant, we don't want to "decompose" this
+ // as part of casing.
+ yield return new object[] { cultureName, "\u0149", "\u0149" };
}
// Turkish i
#if defined(TARGET_OSX) || defined(TARGET_MACCATALYST) || defined(TARGET_IOS) || defined(TARGET_TVOS)
+/**
+ * Is this code unit a lead surrogate (U+d800..U+dbff)?
+ * The mask clears the low 10 bits, so only the high bits are compared with 0xd800.
+ * @param c 16-bit code unit
+ * @return true or false
+ */
+#define IS_LEAD(c) (((c)&0xfffffc00) == 0xd800)
+
+/**
+ * Is this code unit a trail surrogate (U+dc00..U+dfff)?
+ * The mask clears the low 10 bits, so only the high bits are compared with 0xdc00.
+ * @param c 16-bit code unit
+ * @return true or false
+ */
+#define IS_TRAIL(c) (((c)&0xfffffc00) == 0xdc00)
+
+/**
+ * Advance a string offset from one code point boundary to the next.
+ * (Post-incrementing forward iteration.)
+ * No code point value is produced; only the offset is updated.
+ * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
+ *
+ * The length can be negative for a NUL-terminated string: the boundary test is
+ * (i) != (length), which a negative length never satisfies, and a NUL terminator
+ * is not a trail surrogate, so the look-ahead does not step over it.
+ *
+ * If the offset points to the lead surrogate of a supplementary code point,
+ * the following trail surrogate is consumed as well, so the offset moves by two units.
+ * If the offset points to a trail surrogate or to a single, unpaired lead surrogate,
+ * only that one unpaired surrogate is consumed.
+ *
+ * The temporaries c and __c2 are local to the macro's own brace scope;
+ * __c2 is assigned during the look-ahead but never read afterwards.
+ *
+ * @param s const uint16_t* string
+ * @param i string offset, updated in place; must be i<length on entry
+ * @param length string length
+ */
+#define NEXTOFFSET(s, i, length) { \
+ uint16_t c = (s)[(i)++]; \
+ if (IS_LEAD(c)) { \
+ uint16_t __c2; \
+ if ((i) != (length) && IS_TRAIL(__c2 = (s)[(i)])) { \
+ ++(i); \
+ } \
+ } \
+}
/**
* Append a code point to a string, overwriting 1 or 2 code units.
ChangeCaseNative
Performs upper or lower casing of a string into a new buffer, taking into account the specified locale.
+Two things we are considering here:
+1. Prohibiting code point expansions. Some code points expand into several when uppercased or lowercased, which may overflow the destination buffer.
+ Instead, we prohibit these expansions: the string is processed one code point at a time, and the original code point is kept whenever casing would have expanded it.
+2. Properly handling surrogate pairs. A code point can be made up of more than one UTF-16 code unit
+ (e.g. the surrogate pair \uD801\uDC37). All code units of a code point are needed to change its case properly.
Returns 0 for success, non-zero on failure see ErrorCodes.
*/
int32_t GlobalizationNative_ChangeCaseNative(const uint16_t* localeName, int32_t lNameLength,
NSString *locName = [NSString stringWithCharacters: localeName length: lNameLength];
currentLocale = [NSLocale localeWithLocaleIdentifier:locName];
}
- NSString *source = [NSString stringWithCharacters: lpSrc length: cwSrcLength];
- NSString *result = bToUpper ? [source uppercaseStringWithLocale:currentLocale] : [source lowercaseStringWithLocale:currentLocale];
int32_t srcIdx = 0, dstIdx = 0, isError = 0;
uint16_t dstCodepoint;
- while (srcIdx < result.length)
+ while (srcIdx < cwSrcLength)
{
- dstCodepoint = [result characterAtIndex:srcIdx++];
- Append(lpDst, dstIdx, cwDstLength, dstCodepoint, isError);
+ int32_t startIndex = srcIdx;
+ NEXTOFFSET(lpSrc, srcIdx, cwSrcLength);
+ int32_t srcLength = srcIdx - startIndex;
+ NSString *src = [NSString stringWithCharacters: lpSrc + startIndex length: srcLength];
+ NSString *dst = bToUpper ? [src uppercaseStringWithLocale:currentLocale] : [src lowercaseStringWithLocale:currentLocale];
+ int32_t index = 0;
+ // copy every UTF-16 code unit of the current code point (two for a surrogate pair)
+ while (index < srcLength)
+ {
+ // the dst.length > srcLength is to prevent code point expansions
+ dstCodepoint = dst.length > srcLength ? [src characterAtIndex: index] : [dst characterAtIndex: index];
+ Append(lpDst, dstIdx, cwDstLength, dstCodepoint, isError);
+ index++;
+ }
if (isError)
return isError;
}
ChangeCaseInvariantNative
Performs upper or lower casing of a string into a new buffer.
+Two things we are considering here:
+1. Prohibiting code point expansions. Some code points expand into several when uppercased or lowercased, which may overflow the destination buffer.
+ Instead, we prohibit these expansions: the string is processed one code point at a time, and the original code point is kept whenever casing would have expanded it.
+2. Properly handling surrogate pairs. A code point can be made up of more than one UTF-16 code unit
+ (e.g. the surrogate pair \uD801\uDC37). All code units of a code point are needed to change its case properly.
Returns 0 for success, non-zero on failure see ErrorCodes.
*/
int32_t GlobalizationNative_ChangeCaseInvariantNative(const uint16_t* lpSrc, int32_t cwSrcLength, uint16_t* lpDst, int32_t cwDstLength, int32_t bToUpper)
{
- NSString *source = [NSString stringWithCharacters: lpSrc length: cwSrcLength];
- NSString *result = bToUpper ? source.uppercaseString : source.lowercaseString;
-
int32_t srcIdx = 0, dstIdx = 0, isError = 0;
uint16_t dstCodepoint;
- while (srcIdx < result.length)
+ while (srcIdx < cwSrcLength)
{
- dstCodepoint = [result characterAtIndex:srcIdx++];
- Append(lpDst, dstIdx, cwDstLength, dstCodepoint, isError);
+ int32_t startIndex = srcIdx;
+ NEXTOFFSET(lpSrc, srcIdx, cwSrcLength);
+ int32_t srcLength = srcIdx - startIndex;
+ NSString *src = [NSString stringWithCharacters: lpSrc + startIndex length: srcLength];
+ NSString *dst = bToUpper ? src.uppercaseString : src.lowercaseString;
+ int32_t index = 0;
+ // copy every UTF-16 code unit of the current code point (two for a surrogate pair)
+ while (index < srcLength)
+ {
+ // the dst.length > srcLength is to prevent code point expansions
+ dstCodepoint = dst.length > srcLength ? [src characterAtIndex: index] : [dst characterAtIndex: index];
+ Append(lpDst, dstIdx, cwDstLength, dstCodepoint, isError);
+ index++;
+ }
if (isError)
return isError;
}