From 8154757175ead0854b9cf914bff87e03d352a4ce Mon Sep 17 00:00:00 2001 From: Ilona Tomkowicz <32700855+ilonatommy@users.noreply.github.com> Date: Tue, 30 May 2023 08:54:51 +0200 Subject: [PATCH] [wasm][globalization] `HybridGlobalization` fix bug in change case (#86799) * Fix + test. * Fix surrogates problem, document final sigma. * Update change-case.ts * Fix NLS --- docs/design/features/hybrid-globalization.md | 5 + .../tests/System/Globalization/TextInfoTests.cs | 16 ++- .../runtime/hybrid-globalization/change-case.ts | 143 +++++++++++++++++++-- 3 files changed, 152 insertions(+), 12 deletions(-) diff --git a/docs/design/features/hybrid-globalization.md b/docs/design/features/hybrid-globalization.md index 916bada..5d08676 100644 --- a/docs/design/features/hybrid-globalization.md +++ b/docs/design/features/hybrid-globalization.md @@ -27,6 +27,11 @@ Affected public APIs: - TextInfo.ToTitleCase. Case change with invariant culture uses `toUpperCase` / `toLoweCase` functions that do not guarantee a full match with the original invariant culture. +Hybrid case change, same as ICU-based, does not support code point expansion, e.g. "straße" -> "STRAßE" (not "STRASSE"). + +- Final sigma behavior correction: + +ICU-based case change does not respect the final-sigma rule, but hybrid does, so "ΒΌΛΟΣ" -> "βόλος", not "βόλοσ". **String comparison** diff --git a/src/libraries/System.Globalization/tests/System/Globalization/TextInfoTests.cs b/src/libraries/System.Globalization/tests/System/Globalization/TextInfoTests.cs index 11b4dd5..fcc4c53 100644 --- a/src/libraries/System.Globalization/tests/System/Globalization/TextInfoTests.cs +++ b/src/libraries/System.Globalization/tests/System/Globalization/TextInfoTests.cs @@ -272,8 +272,17 @@ namespace System.Globalization.Tests // these sorts of expansions, since it would cause string lengths to change when cased, // which is non-intuitive. In addition, there are some context sensitive mappings which // we also don't preform. 
- // Greek Capital Letter Sigma (does not to case to U+03C2 with "final sigma" rule). + // Greek Capital Letter Sigma (does not case to U+03C2 with "final sigma" rule). yield return new object[] { cultureName, "\u03A3", "\u03C3" }; + if (PlatformDetection.IsHybridGlobalizationOnBrowser) + { + // JS is using "final sigma" rule correctly - it's costly to unify it with ICU's behavior + yield return new object[] { cultureName, "O\u03A3", "o\u03C2" }; + } + else + { + yield return new object[] { cultureName, "O\u03A3", "o\u03C3" }; + } } foreach (string cultureName in GetTestLocales()) @@ -393,7 +402,10 @@ namespace System.Globalization.Tests // which is non-intuitive. In addition, there are some context sensitive mappings which // we also don't preform. // es-zed does not case to SS when uppercased. - yield return new object[] { cultureName, "\u00DF", "\u00DF" }; + yield return new object[] { cultureName, "\u00DF", "\u00DF" }; + yield return new object[] { cultureName, "stra\u00DFe", "STRA\u00DFE" }; + if (!PlatformDetection.IsNlsGlobalization) + yield return new object[] { cultureName, "st\uD801\uDC37ra\u00DFe", "ST\uD801\uDC0FRA\u00DFE" }; // Ligatures do not expand when cased. 
yield return new object[] { cultureName, "\uFB00", "\uFB00" }; diff --git a/src/mono/wasm/runtime/hybrid-globalization/change-case.ts b/src/mono/wasm/runtime/hybrid-globalization/change-case.ts index 6472f81..ea6fa2d 100644 --- a/src/mono/wasm/runtime/hybrid-globalization/change-case.ts +++ b/src/mono/wasm/runtime/hybrid-globalization/change-case.ts @@ -6,18 +6,75 @@ import { monoStringToString, utf16ToStringLoop, stringToUTF16 } from "../strings import { MonoObject, MonoObjectRef, MonoString, MonoStringRef } from "../types/internal"; import { Int32Ptr } from "../types/emscripten"; import { wrap_error_root, wrap_no_error_root } from "../invoke-js"; +import { localHeapViewU16, setU16_local } from "../memory"; + +const SURROGATE_HIGHER_START = "\uD800"; +const SURROGATE_HIGHER_END = "\uDBFF"; +const SURROGATE_LOWER_START = "\uDC00"; +const SURROGATE_LOWER_END = "\uDFFF"; export function mono_wasm_change_case_invariant(src: number, srcLength: number, dst: number, dstLength: number, toUpper: number, is_exception: Int32Ptr, ex_address: MonoObjectRef): void { const exceptionRoot = mono_wasm_new_external_root(ex_address); try { const input = utf16ToStringLoop(src, src + 2 * srcLength); - let result = toUpper ? input.toUpperCase() : input.toLowerCase(); + const result = toUpper ? 
input.toUpperCase() : input.toLowerCase(); // Unicode defines some codepoints which expand into multiple codepoints, // originally we do not support this expansion - if (result.length > dstLength) - result = input; - stringToUTF16(dst, dst + 2 * dstLength, result); - wrap_no_error_root(is_exception, exceptionRoot); + if (result.length <= dstLength) + { + stringToUTF16(dst, dst + 2 * dstLength, result); + wrap_no_error_root(is_exception, exceptionRoot); + return; + } + + // workaround to maintain the ICU-like behavior: case each UTF-16 unit (or surrogate pair) separately and keep the original whenever its mapping would expand + const heapI16 = localHeapViewU16(); + let jump = 1; + if (toUpper) + { + for (let i=0; i < input.length; i+=jump) + { + // surrogate parts have to enter ToUpper/ToLower together to give correct output + if (isSurrogate(input, i)) + { + jump = 2; + const surrogate = input.substring(i, i+2); + const upperSurrogate = surrogate.toUpperCase(); + const appendedSurrogate = upperSurrogate.length > 2 ? surrogate : upperSurrogate; + appendSurrogateToMemory(heapI16, dst, appendedSurrogate, i); + } + else + { + jump = 1; + const upperChar = input[i].toUpperCase(); + const appendedChar = upperChar.length > 1 ? input[i] : upperChar; + setU16_local(heapI16, dst + i*2, appendedChar.charCodeAt(0)); + } + } + } + else + { + for (let i=0; i < input.length; i+=jump) + { + if (isSurrogate(input, i)) + { + jump = 2; + const surrogate = input.substring(i, i+2); + const lowerSurrogate = surrogate.toLowerCase(); + const appendedSurrogate = lowerSurrogate.length > 2 ? surrogate : lowerSurrogate; + appendSurrogateToMemory(heapI16, dst, appendedSurrogate, i); + } + else + { + jump = 1; + const lowerChar = input[i].toLowerCase(); + const appendedChar = lowerChar.length > 1 ? 
input[i] : lowerChar; + setU16_local(heapI16, dst + i*2, appendedChar.charCodeAt(0)); + } + } + } + // report success on the fallback path too, exactly as the fast path above does + wrap_no_error_root(is_exception, exceptionRoot); } catch (ex: any) { wrap_error_root(is_exception, ex, exceptionRoot); @@ -35,11 +92,62 @@ export function mono_wasm_change_case(culture: MonoStringRef, src: number, srcLe if (!cultureName) throw new Error("Cannot change case, the culture name is null."); const input = utf16ToStringLoop(src, src + 2 * srcLength); - let result = toUpper ? input.toLocaleUpperCase(cultureName) : input.toLocaleLowerCase(cultureName); - if (result.length > dstLength) - result = input; + const result = toUpper ? input.toLocaleUpperCase(cultureName) : input.toLocaleLowerCase(cultureName); + + if (result.length <= input.length) + { + stringToUTF16(dst, dst + 2 * dstLength, result); + wrap_no_error_root(is_exception, exceptionRoot); + return; + } + // workaround to maintain the ICU-like behavior: case each UTF-16 unit (or surrogate pair) separately and keep the original whenever its mapping would expand + const heapI16 = localHeapViewU16(); + let jump = 1; + if (toUpper) + { + for (let i=0; i < input.length; i+=jump) + { + // surrogate parts have to enter ToUpper/ToLower together to give correct output + if (isSurrogate(input, i)) + { + jump = 2; + const surrogate = input.substring(i, i+2); + const upperSurrogate = surrogate.toLocaleUpperCase(cultureName); + const appendedSurrogate = upperSurrogate.length > 2 ? surrogate : upperSurrogate; + appendSurrogateToMemory(heapI16, dst, appendedSurrogate, i); - stringToUTF16(dst, dst + 2 * dstLength, result); + } + else + { + jump = 1; + const upperChar = input[i].toLocaleUpperCase(cultureName); + const appendedChar = upperChar.length > 1 ? 
input[i] : upperChar; + setU16_local(heapI16, dst + i*2, appendedChar.charCodeAt(0)); + } + } + } + else + { + for (let i=0; i < input.length; i+=jump) + { + // surrogate parts have to enter ToUpper/ToLower together to give correct output + if (isSurrogate(input, i)) + { + jump = 2; + const surrogate = input.substring(i, i+2); + const lowerSurrogate = surrogate.toLocaleLowerCase(cultureName); + const appendedSurrogate = lowerSurrogate.length > 2 ? surrogate : lowerSurrogate; + appendSurrogateToMemory(heapI16, dst, appendedSurrogate, i); + } + else + { + jump = 1; + const lowerChar = input[i].toLocaleLowerCase(cultureName); + const appendedChar = lowerChar.length > 1 ? input[i] : lowerChar; + setU16_local(heapI16, dst + i*2, appendedChar.charCodeAt(0)); + } + } + } wrap_no_error_root(is_exception, exceptionRoot); } catch (ex: any) { @@ -49,4 +157,19 @@ export function mono_wasm_change_case(culture: MonoStringRef, src: number, srcLe cultureRoot.release(); exceptionRoot.release(); } -} \ No newline at end of file +} + +function isSurrogate(str: string, startIdx: number) : boolean +{ // true when str[startIdx] is a high surrogate immediately followed by a low surrogate + return SURROGATE_HIGHER_START <= str[startIdx] && + str[startIdx] <= SURROGATE_HIGHER_END && + startIdx+1 < str.length && + SURROGATE_LOWER_START <= str[startIdx+1] && + str[startIdx+1] <= SURROGATE_LOWER_END; +} + +function appendSurrogateToMemory(heapI16: Uint16Array, dst: number, surrogate: string, idx: number) +{ // stores both UTF-16 units of the pair, at dst + idx*2 and dst + (idx+1)*2 + setU16_local(heapI16, dst + idx*2, surrogate.charCodeAt(0)); + setU16_local(heapI16, dst + (idx+1)*2, surrogate.charCodeAt(1)); +} -- 2.7.4