From 91c1250176b6757419b4c3d5149dec18a773eb2a Mon Sep 17 00:00:00 2001 From: Tarek Mahmoud Sayed Date: Tue, 17 Aug 2021 13:41:34 -0700 Subject: [PATCH] Fix the string search behavior when using ICU (#57078) --- .../System.Globalization.Native/pal_collation.c | 171 ++++++++++++++++++++- .../pal_icushim_internal.h | 10 +- .../pal_icushim_internal_android.h | 6 +- .../System.Globalization.Native/pal_timeZoneInfo.c | 2 +- .../tests/CompareInfo/CompareInfoTests.IndexOf.cs | 1 + .../CompareInfo/CompareInfoTests.LastIndexOf.cs | 1 + 6 files changed, 178 insertions(+), 13 deletions(-) diff --git a/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c b/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c index 0e0a46c..4a4c969 100644 --- a/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c +++ b/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c @@ -417,7 +417,7 @@ static int CanIgnoreAllCollationElements(const UCollator* pColl, const UChar* lp static void CreateSortHandle(SortHandle** ppSortHandle) { - *ppSortHandle = (SortHandle*)malloc(sizeof(SortHandle)); + *ppSortHandle = (SortHandle*)calloc(1, sizeof(SortHandle)); if ((*ppSortHandle) == NULL) { return; @@ -449,6 +449,150 @@ ResultCode GlobalizationNative_GetSortHandle(const char* lpLocaleName, SortHandl return GetResultCode(err); } +static const char* BreakIteratorRuleOld = // supported on ICU like versions 52 + "$CR = [\\p{Grapheme_Cluster_Break = CR}]; \n" \ + "$LF = [\\p{Grapheme_Cluster_Break = LF}]; \n" \ + "$Control = [\\p{Grapheme_Cluster_Break = Control}]; \n" \ + "$Extend = [\\p{Grapheme_Cluster_Break = Extend}]; \n" \ + "$SpacingMark = [\\p{Grapheme_Cluster_Break = SpacingMark}]; \n" \ + "$Regional_Indicator = [\\p{Grapheme_Cluster_Break = Regional_Indicator}]; \n" \ + "$L = [\\p{Grapheme_Cluster_Break = L}]; \n" \ + "$V = [\\p{Grapheme_Cluster_Break = V}]; \n" \ + "$T = [\\p{Grapheme_Cluster_Break = T}]; \n" \ + "$LV = [\\p{Grapheme_Cluster_Break = LV}]; \n" \ + "$LVT = [\\p{Grapheme_Cluster_Break = LVT}]; \n" \ + "!!chain; \n" \ + "!!forward; \n" \ + "$L ($L | $V | $LV | $LVT); \n" \ + "($LV | $V) ($V | $T); \n" \ + "($LVT | $T) $T; \n" \ + "$Regional_Indicator $Regional_Indicator; \n" \ + "[^$Control $CR $LF] $Extend; \n" \ + "[^$Control $CR $LF] $SpacingMark; \n" \ + "!!reverse; \n" \ + "($L | $V | $LV | $LVT) $L; \n" \ + "($V | $T) ($LV | $V); \n" \ + "$T ($LVT | $T); \n" \ + "$Regional_Indicator $Regional_Indicator; \n" \ + "$Extend [^$Control $CR $LF]; \n" \ + "$SpacingMark [^$Control $CR $LF]; \n" \ + "!!safe_reverse; \n" \ + "!!safe_forward; \n"; + +static const char* BreakIteratorRuleNew = // supported on newer ICU versions like 62 and up + "!!quoted_literals_only; \n" \ + "$CR = [\\p{Grapheme_Cluster_Break = CR}]; \n" \ + "$LF = [\\p{Grapheme_Cluster_Break = LF}]; \n" \ + "$Control = [[\\p{Grapheme_Cluster_Break = Control}]]; \n" \ + "$Extend = [[\\p{Grapheme_Cluster_Break = Extend}]]; \n" \ + "$ZWJ = [\\p{Grapheme_Cluster_Break = ZWJ}]; \n" \ + "$Regional_Indicator = [\\p{Grapheme_Cluster_Break = Regional_Indicator}]; \n" \ + "$Prepend = [\\p{Grapheme_Cluster_Break = Prepend}]; \n" \ + "$SpacingMark = [\\p{Grapheme_Cluster_Break = SpacingMark}]; \n" \ + "$Virama = [\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&\\p{Indic_Syllabic_Category=Virama}]; \n" \ + "$LinkingConsonant = [\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&\\p{Indic_Syllabic_Category=Consonant}]; \n" \ + "$ExtCccZwj = [[\\p{gcb=Extend}-\\p{ccc=0}] \\p{gcb=ZWJ}]; \n" \ + "$L = [\\p{Grapheme_Cluster_Break = L}]; \n" \ + "$V = [\\p{Grapheme_Cluster_Break = V}]; \n" \ + "$T = [\\p{Grapheme_Cluster_Break = T}]; \n" \ + "$LV = [\\p{Grapheme_Cluster_Break = LV}]; \n" \ + "$LVT = [\\p{Grapheme_Cluster_Break = LVT}]; \n" \ + "$Extended_Pict = [:ExtPict:]; \n" \ + "!!chain; \n" \ + "!!lookAheadHardBreak; \n" \ + "$L ($L | $V | $LV | $LVT); \n" \ + "($LV | $V) ($V | $T); \n" \ + "($LVT | $T) $T; \n" \ + "[^$Control $CR $LF] ($Extend | $ZWJ); \n" \ + "[^$Control $CR $LF] $SpacingMark; \n" \ + "$Prepend [^$Control $CR $LF]; \n" \ + "$LinkingConsonant $ExtCccZwj* $Virama $ExtCccZwj* $LinkingConsonant; \n" \ + "$Extended_Pict $Extend* $ZWJ $Extended_Pict; \n" \ + "^$Prepend* $Regional_Indicator $Regional_Indicator / $Regional_Indicator; \n" \ + "^$Prepend* $Regional_Indicator $Regional_Indicator; \n" \ + ".;"; + +static UChar* s_breakIteratorRules = NULL; + +// When doing string search operations using ICU, it is internally using a break iterator which doesn't allow breaking between some characters according to +// the Grapheme Cluster Boundary Rules specified in http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules. +// Unfortunately, not all rules will have the desired behavior we need to get in .NET. For example, the rules don't allow breaking between CR '\r' and LF '\n' characters. +// When searching for "\n" in a string like "\r\n", will get not found result. +// We are customizing the break iterator to exclude the CRxLF rule which don't allow breaking between CR and LF. +// The general rules syntax explained in the doc https://unicode-org.github.io/icu/userguide/boundaryanalysis/break-rules.html. +// The ICU latest rules definition exist here https://github.com/unicode-org/icu/blob/main/icu4c/source/data/brkitr/rules/char.txt. +static UBreakIterator* CreateCustomizedBreakIterator() +{ + static UChar emptyString[1]; + UBreakIterator* breaker; + + UErrorCode status = U_ZERO_ERROR; + if (s_breakIteratorRules != NULL) + { + breaker = ubrk_openRules(s_breakIteratorRules, -1, emptyString, 0, NULL, &status); + return U_FAILURE(status) ? NULL : breaker; + } + + int32_t oldRulesLength = (int32_t)strlen(BreakIteratorRuleOld); + int32_t newRulesLength = (int32_t)strlen(BreakIteratorRuleNew); + + int32_t breakIteratorRulesLength = newRulesLength > oldRulesLength ? newRulesLength : oldRulesLength; + + UChar* rules = (UChar*)calloc((size_t)breakIteratorRulesLength + 1, sizeof(UChar)); + if (rules == NULL) + { + return NULL; + } + + u_uastrncpy(rules, BreakIteratorRuleNew, newRulesLength); + rules[newRulesLength] = '\0'; + + breaker = ubrk_openRules(rules, newRulesLength, emptyString, 0, NULL, &status); + if (U_FAILURE(status)) + { + status = U_ZERO_ERROR; + u_uastrncpy(rules, BreakIteratorRuleOld, oldRulesLength); + rules[oldRulesLength] = '\0'; + breaker = ubrk_openRules(rules, oldRulesLength, emptyString, 0, NULL, &status); + } + + if (U_FAILURE(status)) + { + free(rules); + return NULL; + } + + UChar* pNull = NULL; + if (!pal_atomic_cas_ptr((void* volatile*)&s_breakIteratorRules, rules, pNull)) + { + free(rules); + assert(s_breakIteratorRules != NULL); + } + + return breaker; +} + +static void CloseSearchIterator(UStringSearch* pSearch) +{ + assert(pSearch != NULL); + +#if !defined(TARGET_WINDOWS) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wcast-qual" +#endif + UBreakIterator* breakIterator = (UBreakIterator*)usearch_getBreakIterator(pSearch); +#if !defined(TARGET_WINDOWS) +#pragma GCC diagnostic pop +#endif + + usearch_close(pSearch); + + if (breakIterator != NULL) + { + ubrk_close(breakIterator); + } +} + void GlobalizationNative_CloseSortHandle(SortHandle* pSortHandle) { for (int i = 0; i <= CompareOptionsMask; i++) @@ -460,7 +604,7 @@ void GlobalizationNative_CloseSortHandle(SortHandle* pSortHandle) { if (pSearch != USED_STRING_SEARCH) { - usearch_close(pSearch); + CloseSearchIterator(pSearch); } pSortHandle->searchIteratorList[i].searchIterator = NULL; SearchIteratorNode* pNext = pSortHandle->searchIteratorList[i].next; @@ -470,7 +614,7 @@ void GlobalizationNative_CloseSortHandle(SortHandle* pSortHandle) { if (pNext->searchIterator != NULL && pNext->searchIterator != USED_STRING_SEARCH) { - usearch_close(pNext->searchIterator); + CloseSearchIterator(pNext->searchIterator); } SearchIteratorNode* pCurrent = pNext; pNext = pCurrent->next; @@ -518,7 +662,7 @@ static const UCollator* GetCollatorFromSortHandle(SortHandle* pSortHandle, int32 // CreateNewSearchNode will create a new node in the linked list and mark this node search handle as borrowed handle. static inline int32_t CreateNewSearchNode(SortHandle* pSortHandle, int32_t options) { - SearchIteratorNode* node = (SearchIteratorNode*) malloc(sizeof(SearchIteratorNode)); + SearchIteratorNode* node = (SearchIteratorNode*)calloc(1, sizeof(SearchIteratorNode)); if (node == NULL) { return false; @@ -581,9 +725,14 @@ static int32_t GetSearchIteratorUsingCollator( if (*pSearchIterator == NULL) { - *pSearchIterator = usearch_openFromCollator(lpTarget, cwTargetLength, lpSource, cwSourceLength, pColl, NULL, &err); + UBreakIterator* breakIterator = CreateCustomizedBreakIterator(); + *pSearchIterator = usearch_openFromCollator(lpTarget, cwTargetLength, lpSource, cwSourceLength, pColl, breakIterator, &err); if (!U_SUCCESS(err)) { + if (breakIterator != NULL) + { + ubrk_close(breakIterator); + } assert(false && "Couldn't open the search iterator."); return -1; } @@ -593,7 +742,7 @@ static int32_t GetSearchIteratorUsingCollator( { if (!CreateNewSearchNode(pSortHandle, options)) { - usearch_close(*pSearchIterator); + CloseSearchIterator(*pSearchIterator); return -1; } } @@ -619,16 +768,22 @@ static int32_t GetSearchIteratorUsingCollator( if (*pSearchIterator == NULL) // Couldn't find any available handle to borrow then create a new one. { - *pSearchIterator = usearch_openFromCollator(lpTarget, cwTargetLength, lpSource, cwSourceLength, pColl, NULL, &err); + UBreakIterator* breakIterator = CreateCustomizedBreakIterator(); + *pSearchIterator = usearch_openFromCollator(lpTarget, cwTargetLength, lpSource, cwSourceLength, pColl, breakIterator, &err); if (!U_SUCCESS(err)) { + if (breakIterator != NULL) + { + ubrk_close(breakIterator); + } + assert(false && "Couldn't open a new search iterator."); return -1; } if (!CreateNewSearchNode(pSortHandle, options)) { - usearch_close(*pSearchIterator); + CloseSearchIterator(*pSearchIterator); return -1; } diff --git a/src/libraries/Native/Unix/System.Globalization.Native/pal_icushim_internal.h b/src/libraries/Native/Unix/System.Globalization.Native/pal_icushim_internal.h index b70e31f..92ded52 100644 --- a/src/libraries/Native/Unix/System.Globalization.Native/pal_icushim_internal.h +++ b/src/libraries/Native/Unix/System.Globalization.Native/pal_icushim_internal.h @@ -72,7 +72,9 @@ U_CAPI int32_t U_EXPORT2 ucal_getWindowsTimeZoneID(const UChar* id, int32_t len, PER_FUNCTION_BLOCK(u_strncpy, libicuuc, true) \ PER_FUNCTION_BLOCK(u_tolower, libicuuc, true) \ PER_FUNCTION_BLOCK(u_toupper, libicuuc, true) \ - PER_FUNCTION_BLOCK(u_uastrcpy, libicuuc, true) \ + PER_FUNCTION_BLOCK(u_uastrncpy, libicuuc, true) \ + PER_FUNCTION_BLOCK(ubrk_close, libicui18n, true) \ + PER_FUNCTION_BLOCK(ubrk_openRules, libicui18n, true) \ PER_FUNCTION_BLOCK(ucal_add, libicui18n, true) \ PER_FUNCTION_BLOCK(ucal_close, libicui18n, true) \ PER_FUNCTION_BLOCK(ucal_get, libicui18n, true) \ @@ -155,6 +157,7 @@ U_CAPI int32_t U_EXPORT2 ucal_getWindowsTimeZoneID(const UChar* id, int32_t len, PER_FUNCTION_BLOCK(ures_open, libicuuc, true) \ PER_FUNCTION_BLOCK(usearch_close, libicui18n, true) \ PER_FUNCTION_BLOCK(usearch_first, libicui18n, true) \ + PER_FUNCTION_BLOCK(usearch_getBreakIterator, libicui18n, true) \ PER_FUNCTION_BLOCK(usearch_getMatchedLength, libicui18n, true) \ PER_FUNCTION_BLOCK(usearch_last, libicui18n, true) \ PER_FUNCTION_BLOCK(usearch_openFromCollator, libicui18n, true) \ @@ -215,7 +218,9 @@ FOR_ALL_ICU_FUNCTIONS #define u_strncpy(...) u_strncpy_ptr(__VA_ARGS__) #define u_tolower(...) u_tolower_ptr(__VA_ARGS__) #define u_toupper(...) u_toupper_ptr(__VA_ARGS__) -#define u_uastrcpy(...) u_uastrcpy_ptr(__VA_ARGS__) +#define u_uastrncpy(...) u_uastrncpy_ptr(__VA_ARGS__) +#define ubrk_close(...) ubrk_close_ptr(__VA_ARGS__) +#define ubrk_openRules(...) ubrk_openRules_ptr(__VA_ARGS__) #define ucal_add(...) ucal_add_ptr(__VA_ARGS__) #define ucal_close(...) ucal_close_ptr(__VA_ARGS__) #define ucal_get(...) ucal_get_ptr(__VA_ARGS__) @@ -310,6 +315,7 @@ FOR_ALL_ICU_FUNCTIONS #define ures_open(...) ures_open_ptr(__VA_ARGS__) #define usearch_close(...) usearch_close_ptr(__VA_ARGS__) #define usearch_first(...) usearch_first_ptr(__VA_ARGS__) +#define usearch_getBreakIterator(...) usearch_getBreakIterator_ptr(__VA_ARGS__) #define usearch_getMatchedLength(...) usearch_getMatchedLength_ptr(__VA_ARGS__) #define usearch_last(...) usearch_last_ptr(__VA_ARGS__) #define usearch_openFromCollator(...) usearch_openFromCollator_ptr(__VA_ARGS__) diff --git a/src/libraries/Native/Unix/System.Globalization.Native/pal_icushim_internal_android.h b/src/libraries/Native/Unix/System.Globalization.Native/pal_icushim_internal_android.h index 4439787..1125ce9 100644 --- a/src/libraries/Native/Unix/System.Globalization.Native/pal_icushim_internal_android.h +++ b/src/libraries/Native/Unix/System.Globalization.Native/pal_icushim_internal_android.h @@ -433,7 +433,6 @@ typedef struct UFieldPosition { int32_t endIndex; } UFieldPosition; - void u_charsToUChars(const char * cs, UChar * us, int32_t length); void u_getVersion(UVersionInfo versionArray); int32_t u_strlen(const UChar * s); @@ -442,7 +441,9 @@ UChar * u_strcpy(UChar * dst, const UChar * src); UChar * u_strncpy(UChar * dst, const UChar * src, int32_t n); UChar32 u_tolower(UChar32 c); UChar32 u_toupper(UChar32 c); -UChar* u_uastrcpy(UChar * dst, const char * src); +UChar* u_uastrncpy(UChar * dst, const char * src, int32_t n); +void ubrk_close(UBreakIterator * bi); +UBreakIterator* ubrk_openRules(const UChar * rules, int32_t rulesLength, const UChar * text, int32_t textLength, UParseError * parseErr, UErrorCode * status); void ucal_add(UCalendar * cal, UCalendarDateFields field, int32_t amount, UErrorCode * status); void ucal_close(UCalendar * cal); int32_t ucal_get(const UCalendar * cal, UCalendarDateFields field, UErrorCode * status); @@ -532,6 +533,7 @@ const UChar * ures_getStringByIndex(const UResourceBundle * resourceBundle, int3 UResourceBundle * ures_open(const char * packageName, const char * locale, UErrorCode * status); void usearch_close(UStringSearch * searchiter); int32_t usearch_first(UStringSearch * strsrch, UErrorCode * status); +const UBreakIterator* usearch_getBreakIterator(const UStringSearch * strsrch); int32_t usearch_getMatchedLength(const UStringSearch * strsrch); int32_t usearch_last(UStringSearch * strsrch, UErrorCode * status); UStringSearch * usearch_openFromCollator(const UChar * pattern, int32_t patternlength, const UChar * text, int32_t textlength, const UCollator * collator, UBreakIterator * breakiter, UErrorCode * status); diff --git a/src/libraries/Native/Unix/System.Globalization.Native/pal_timeZoneInfo.c b/src/libraries/Native/Unix/System.Globalization.Native/pal_timeZoneInfo.c index 2620c14..018746e 100644 --- a/src/libraries/Native/Unix/System.Globalization.Native/pal_timeZoneInfo.c +++ b/src/libraries/Native/Unix/System.Globalization.Native/pal_timeZoneInfo.c @@ -223,7 +223,7 @@ static void FixupTimeZoneGenericDisplayName(const char* locale, const UChar* tim } // Make a UChar[] version of the test time zone id for use in the API calls. - u_uastrcpy(testTimeZoneId, testId); + u_uastrncpy(testTimeZoneId, testId, TZID_LENGTH); // Get the standard name from the test time zone. GetTimeZoneDisplayName_FromCalendar(locale, testTimeZoneId, timestamp, UCAL_STANDARD, testDisplayName, DISPLAY_NAME_LENGTH, err); diff --git a/src/libraries/System.Globalization/tests/CompareInfo/CompareInfoTests.IndexOf.cs b/src/libraries/System.Globalization/tests/CompareInfo/CompareInfoTests.IndexOf.cs index 960db32..8c19567 100644 --- a/src/libraries/System.Globalization/tests/CompareInfo/CompareInfoTests.IndexOf.cs +++ b/src/libraries/System.Globalization/tests/CompareInfo/CompareInfoTests.IndexOf.cs @@ -72,6 +72,7 @@ namespace System.Globalization.Tests yield return new object[] { s_invariantCompare, "FooBar", "Foo\u0400Bar", 0, 6, CompareOptions.Ordinal, -1, 0 }; yield return new object[] { s_invariantCompare, "TestFooBA\u0300R", "FooB\u00C0R", 0, 11, CompareOptions.IgnoreNonSpace, 4, 7 }; yield return new object[] { s_invariantCompare, "o\u0308", "o", 0, 2, CompareOptions.None, -1, 0 }; + yield return new object[] { s_invariantCompare, "\r\n", "\n", 0, 2, CompareOptions.None, 1, 1 }; // Weightless characters yield return new object[] { s_invariantCompare, "", "\u200d", 0, 0, CompareOptions.None, 0, 0 }; diff --git a/src/libraries/System.Globalization/tests/CompareInfo/CompareInfoTests.LastIndexOf.cs b/src/libraries/System.Globalization/tests/CompareInfo/CompareInfoTests.LastIndexOf.cs index a400ded..e5bd894 100644 --- a/src/libraries/System.Globalization/tests/CompareInfo/CompareInfoTests.LastIndexOf.cs +++ b/src/libraries/System.Globalization/tests/CompareInfo/CompareInfoTests.LastIndexOf.cs @@ -86,6 +86,7 @@ namespace System.Globalization.Tests yield return new object[] { s_invariantCompare, "FooBar", "Foo\u0400Bar", 5, 6, CompareOptions.Ordinal, -1, 0 }; yield return new object[] { s_invariantCompare, "TestFooBA\u0300R", "FooB\u00C0R", 10, 11, CompareOptions.IgnoreNonSpace, 4, 7 }; yield return new object[] { s_invariantCompare, "o\u0308", "o", 1, 2, CompareOptions.None, -1, 0 }; + yield return new object[] { s_invariantCompare, "\r\n", "\n", 1, 2, CompareOptions.None, 1, 1 }; // Weightless characters // NLS matches weightless characters at the end of the string -- 2.7.4