From 96e8087ae102d522995bff2c609ab4e01c9d9686 Mon Sep 17 00:00:00 2001 From: Tarek Mahmoud Sayed Date: Wed, 2 Aug 2023 08:55:49 -0700 Subject: [PATCH] Fix Globalization Invariant string comparisons with ordinal ignore casing (#89660) --- .../tests/Invariant/InvariantMode.cs | 37 +++++++++++++++ .../CharUnicodeInfoTests.Generated.cs | 53 ++++++++++++++++++++++ .../src/System/Globalization/CharUnicodeInfo.cs | 24 ++++++---- 3 files changed, 104 insertions(+), 10 deletions(-) diff --git a/src/libraries/System.Globalization/tests/Invariant/InvariantMode.cs b/src/libraries/System.Globalization/tests/Invariant/InvariantMode.cs index fd2d62a..2dd0790 100644 --- a/src/libraries/System.Globalization/tests/Invariant/InvariantMode.cs +++ b/src/libraries/System.Globalization/tests/Invariant/InvariantMode.cs @@ -1206,6 +1206,43 @@ namespace System.Globalization.Tests Assert.Equal(new CultureInfo(culture), CultureInfo.InvariantCulture); } + [Fact] + public void TestCasing() + { + // This test verifies that the casing table in the invariant globalization mode. + // The casing table is what we generate from the Unicode data files, and implemented in the CharUnicodeInfo.cs file. + + TextInfo textInfo = CultureInfo.InvariantCulture.TextInfo; + + for (int i = 0; i <= 0xFFFF; i++) + { + char ch = (char)i; + char upper = textInfo.ToUpper(ch); + char lower = textInfo.ToLower(ch); + + if (ch != upper) + { + string upperString = upper.ToString(); + string chString = ch.ToString(); + + Assert.True(chString.Equals(upperString, StringComparison.OrdinalIgnoreCase), $"Expected {(int)ch:x4} to be equal to {(int)upper:x4}."); + Assert.True(chString.IndexOf(upperString, StringComparison.OrdinalIgnoreCase) == 0, $"Expected {(int)ch:x4} exist in {(int)upper:x4}."); + Assert.True(chString.StartsWith(upperString, StringComparison.OrdinalIgnoreCase), $"Expected {(int)ch:x4} start with {(int)upper:x4}."); + } + + // String comparisons has been done using ToUpper method, it is possible the lowercased character can be mapped to a different character when it is upper cased. + if (ch != lower && textInfo.ToUpper(lower) == ch) + { + string lowerString = lower.ToString(); + string chString = ch.ToString(); + + Assert.True(chString.Equals(lowerString, StringComparison.OrdinalIgnoreCase), $"Expected {(int)ch:x4} to be equal to {(int)lower:x4}."); + Assert.True(chString.IndexOf(lowerString, StringComparison.OrdinalIgnoreCase) == 0, $"Expected {(int)ch:x4} exist in {(int)lower:x4}."); + Assert.True(chString.StartsWith(lowerString, StringComparison.OrdinalIgnoreCase), $"Expected {(int)ch:x4} start with {(int)lower:x4}."); + } + } + } + private static byte[] GetExpectedInvariantOrdinalSortKey(ReadOnlySpan input) { MemoryStream memoryStream = new MemoryStream(); diff --git a/src/libraries/System.Globalization/tests/System/Globalization/CharUnicodeInfoTests.Generated.cs b/src/libraries/System.Globalization/tests/System/Globalization/CharUnicodeInfoTests.Generated.cs index a110dd6..d47706e 100644 --- a/src/libraries/System.Globalization/tests/System/Globalization/CharUnicodeInfoTests.Generated.cs +++ b/src/libraries/System.Globalization/tests/System/Globalization/CharUnicodeInfoTests.Generated.cs @@ -2,6 +2,7 @@ // The .NET Foundation licenses this file to you under the MIT license. using System.Collections.Generic; +using System.Reflection; using System.Text.Unicode; using Xunit; using Xunit.Sdk; @@ -78,6 +79,58 @@ namespace System.Globalization.Tests } } + [Fact] + public void TestCasing() + { + Func toUpperUInt = (Func) typeof(CharUnicodeInfo) + .GetMethod("ToUpper", BindingFlags.NonPublic | System.Reflection.BindingFlags.Static, new Type[] { typeof(uint) }) + .CreateDelegate(typeof(Func)); + Func toUpperChar = (Func) typeof(CharUnicodeInfo) + .GetMethod("ToUpper", BindingFlags.NonPublic | System.Reflection.BindingFlags.Static, new Type[] { typeof(char) }) + .CreateDelegate(typeof(Func)); + Func toLowerUInt = (Func) typeof(CharUnicodeInfo) + .GetMethod("ToLower", BindingFlags.NonPublic | System.Reflection.BindingFlags.Static, new Type[] { typeof(uint) }) + .CreateDelegate(typeof(Func)); + Func toLowerChar = (Func) typeof(CharUnicodeInfo) + .GetMethod("ToLower", BindingFlags.NonPublic | System.Reflection.BindingFlags.Static, new Type[] { typeof(char) }) + .CreateDelegate(typeof(Func)); + + for (int i = 0; i <= 0xFFFF; i++) + { + if (i == 0x0130 || // We special case Turkish uppercase i + i == 0x0131 || // and Turkish lowercase i + i == 0x017f) // and LATIN SMALL LETTER LONG S + { + continue; + } + + CodePoint codePoint = UnicodeData.GetData(i); + + Assert.True(codePoint.SimpleUppercaseMapping == (int)toUpperUInt((uint)i), + $"CharUnicodeInfo.ToUpper({i:x4}) returned unexpected value. Expected: {codePoint.SimpleUppercaseMapping:x4}, Actual: {toUpperUInt((uint)i):x4}"); + + Assert.True(codePoint.SimpleUppercaseMapping == (int)toUpperChar((char)i), + $"CharUnicodeInfo.ToUpper({i:x4}) returned unexpected value. Expected: {codePoint.SimpleUppercaseMapping:x4}, Actual: {(int)toUpperChar((char)i):x4}"); + + Assert.True(codePoint.SimpleLowercaseMapping == (int)toLowerUInt((uint)i), + $"CharUnicodeInfo.ToLower({i:x4}) returned unexpected value. Expected: {codePoint.SimpleLowercaseMapping:x4}, Actual: {toLowerUInt((uint)i):x4}"); + + Assert.True(codePoint.SimpleLowercaseMapping == (int)toLowerChar((char)i), + $"CharUnicodeInfo.ToLower({i:x4}) returned unexpected value. Expected: {codePoint.SimpleLowercaseMapping:x4}, Actual: {(int)toLowerChar((char)i):x4}"); + } + + for (int i = 0x10000; i <= HIGHEST_CODE_POINT; i++) + { + CodePoint codePoint = UnicodeData.GetData(i); + + Assert.True(codePoint.SimpleUppercaseMapping == (int)toUpperUInt((uint)i), + $"CharUnicodeInfo.ToUpper({i:x4}) returned unexpected value. Expected: {codePoint.SimpleUppercaseMapping:x4}, Actual: {toUpperUInt((uint)i):x4}"); + + Assert.True(codePoint.SimpleLowercaseMapping == (int)toLowerUInt((uint)i), + $"CharUnicodeInfo.ToLower({i:x4}) returned unexpected value. Expected: {codePoint.SimpleLowercaseMapping:x4}, Actual: {toLowerUInt((uint)i):x4}"); + } + } + private static void AssertEqual(T expected, T actual, string methodName, CodePoint codePoint) { if (!EqualityComparer.Default.Equals(expected, actual)) diff --git a/src/libraries/System.Private.CoreLib/src/System/Globalization/CharUnicodeInfo.cs b/src/libraries/System.Private.CoreLib/src/System/Globalization/CharUnicodeInfo.cs index 0e391c2..5a1ce17 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Globalization/CharUnicodeInfo.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Globalization/CharUnicodeInfo.cs @@ -273,13 +273,15 @@ namespace System.Globalization nuint offset = GetCategoryCasingTableOffsetNoBoundsChecks(codePoint); - // The offset is specified in shorts: - // Get the 'ref short' corresponding to where the addend is, read it as a signed 16-bit value, then add + // The mapped casing for the codePoint usually exists in the same plane as codePoint. + // This is why we use 16-bit offsets to calculate the delta value from the codePoint. - ref short rsStart = ref Unsafe.As(ref MemoryMarshal.GetReference(UppercaseValues)); - ref short rsDelta = ref Unsafe.Add(ref rsStart, (nint)offset); + ref ushort rsStart = ref Unsafe.As(ref MemoryMarshal.GetReference(UppercaseValues)); + ref ushort rsDelta = ref Unsafe.Add(ref rsStart, (nint)offset); int delta = (BitConverter.IsLittleEndian) ? rsDelta : BinaryPrimitives.ReverseEndianness(rsDelta); - return (uint)delta + codePoint; + + // We use the mask 0xFFFF0000u as we are sure the casing is in the same plane as codePoint. + return (codePoint & 0xFFFF0000u) | (ushort)((uint)delta + codePoint); } [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -306,13 +308,15 @@ namespace System.Globalization nuint offset = GetCategoryCasingTableOffsetNoBoundsChecks(codePoint); - // If the offset is specified in shorts: - // Get the 'ref short' corresponding to where the addend is, read it as a signed 16-bit value, then add + // The mapped casing for the codePoint usually exists in the same plane as codePoint. + // This is why we use 16-bit offsets to calculate the delta value from the codePoint. - ref short rsStart = ref Unsafe.As(ref MemoryMarshal.GetReference(LowercaseValues)); - ref short rsDelta = ref Unsafe.Add(ref rsStart, (nint)offset); + ref ushort rsStart = ref Unsafe.As(ref MemoryMarshal.GetReference(LowercaseValues)); + ref ushort rsDelta = ref Unsafe.Add(ref rsStart, (nint)offset); int delta = (BitConverter.IsLittleEndian) ? rsDelta : BinaryPrimitives.ReverseEndianness(rsDelta); - return (uint)delta + codePoint; + + // We use the mask 0xFFFF0000u as we are sure the casing is in the same plane as codePoint. + return (codePoint & 0xFFFF0000u) | (ushort)((uint)delta + codePoint); } /* -- 2.7.4