Fix the string search behavior when using ICU (#57078)
authorTarek Mahmoud Sayed <tarekms@microsoft.com>
Tue, 17 Aug 2021 20:41:34 +0000 (13:41 -0700)
committerGitHub <noreply@github.com>
Tue, 17 Aug 2021 20:41:34 +0000 (13:41 -0700)
src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c
src/libraries/Native/Unix/System.Globalization.Native/pal_icushim_internal.h
src/libraries/Native/Unix/System.Globalization.Native/pal_icushim_internal_android.h
src/libraries/Native/Unix/System.Globalization.Native/pal_timeZoneInfo.c
src/libraries/System.Globalization/tests/CompareInfo/CompareInfoTests.IndexOf.cs
src/libraries/System.Globalization/tests/CompareInfo/CompareInfoTests.LastIndexOf.cs

index 0e0a46c..4a4c969 100644 (file)
@@ -417,7 +417,7 @@ static int CanIgnoreAllCollationElements(const UCollator* pColl, const UChar* lp
 
 static void CreateSortHandle(SortHandle** ppSortHandle)
 {
-    *ppSortHandle = (SortHandle*)malloc(sizeof(SortHandle));
+    *ppSortHandle = (SortHandle*)calloc(1, sizeof(SortHandle));
     if ((*ppSortHandle) == NULL)
     {
         return;
@@ -449,6 +449,150 @@ ResultCode GlobalizationNative_GetSortHandle(const char* lpLocaleName, SortHandl
     return GetResultCode(err);
 }
 
+static const char* BreakIteratorRuleOld =  // supported on ICU like versions 52
+                        "$CR          = [\\p{Grapheme_Cluster_Break = CR}]; \n" \
+                        "$LF          = [\\p{Grapheme_Cluster_Break = LF}]; \n" \
+                        "$Control     = [\\p{Grapheme_Cluster_Break = Control}]; \n" \
+                        "$Extend      = [\\p{Grapheme_Cluster_Break = Extend}]; \n" \
+                        "$SpacingMark = [\\p{Grapheme_Cluster_Break = SpacingMark}]; \n" \
+                        "$Regional_Indicator = [\\p{Grapheme_Cluster_Break = Regional_Indicator}]; \n" \
+                        "$L       = [\\p{Grapheme_Cluster_Break = L}]; \n" \
+                        "$V       = [\\p{Grapheme_Cluster_Break = V}]; \n" \
+                        "$T       = [\\p{Grapheme_Cluster_Break = T}]; \n" \
+                        "$LV      = [\\p{Grapheme_Cluster_Break = LV}]; \n" \
+                        "$LVT     = [\\p{Grapheme_Cluster_Break = LVT}]; \n" \
+                        "!!chain; \n" \
+                        "!!forward; \n" \
+                        "$L ($L | $V | $LV | $LVT); \n" \
+                        "($LV | $V) ($V | $T); \n" \
+                        "($LVT | $T) $T; \n" \
+                        "$Regional_Indicator $Regional_Indicator; \n" \
+                        "[^$Control $CR $LF] $Extend; \n" \
+                        "[^$Control $CR $LF] $SpacingMark; \n" \
+                        "!!reverse; \n" \
+                        "($L | $V | $LV | $LVT) $L; \n" \
+                        "($V | $T) ($LV | $V); \n" \
+                        "$T ($LVT | $T); \n" \
+                        "$Regional_Indicator $Regional_Indicator; \n" \
+                        "$Extend      [^$Control $CR $LF]; \n" \
+                        "$SpacingMark [^$Control $CR $LF]; \n" \
+                        "!!safe_reverse; \n" \
+                        "!!safe_forward; \n";
+
+static const char* BreakIteratorRuleNew =  // supported on newer ICU versions like 62 and up
+                        "!!quoted_literals_only; \n" \
+                        "$CR          = [\\p{Grapheme_Cluster_Break = CR}]; \n" \
+                        "$LF          = [\\p{Grapheme_Cluster_Break = LF}]; \n" \
+                        "$Control     = [[\\p{Grapheme_Cluster_Break = Control}]]; \n" \
+                        "$Extend      = [[\\p{Grapheme_Cluster_Break = Extend}]]; \n" \
+                        "$ZWJ         = [\\p{Grapheme_Cluster_Break = ZWJ}]; \n" \
+                        "$Regional_Indicator = [\\p{Grapheme_Cluster_Break = Regional_Indicator}]; \n" \
+                        "$Prepend     = [\\p{Grapheme_Cluster_Break = Prepend}]; \n" \
+                        "$SpacingMark = [\\p{Grapheme_Cluster_Break = SpacingMark}]; \n" \
+                        "$Virama      = [\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&\\p{Indic_Syllabic_Category=Virama}]; \n" \
+                        "$LinkingConsonant = [\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&\\p{Indic_Syllabic_Category=Consonant}]; \n" \
+                        "$ExtCccZwj   = [[\\p{gcb=Extend}-\\p{ccc=0}] \\p{gcb=ZWJ}]; \n" \
+                        "$L           = [\\p{Grapheme_Cluster_Break = L}]; \n" \
+                        "$V           = [\\p{Grapheme_Cluster_Break = V}]; \n" \
+                        "$T           = [\\p{Grapheme_Cluster_Break = T}]; \n" \
+                        "$LV          = [\\p{Grapheme_Cluster_Break = LV}]; \n" \
+                        "$LVT         = [\\p{Grapheme_Cluster_Break = LVT}]; \n" \
+                        "$Extended_Pict = [:ExtPict:]; \n" \
+                        "!!chain; \n" \
+                        "!!lookAheadHardBreak; \n" \
+                        "$L ($L | $V | $LV | $LVT); \n" \
+                        "($LV | $V) ($V | $T); \n" \
+                        "($LVT | $T) $T; \n" \
+                        "[^$Control $CR $LF] ($Extend | $ZWJ); \n" \
+                        "[^$Control $CR $LF] $SpacingMark; \n" \
+                        "$Prepend [^$Control $CR $LF]; \n" \
+                        "$LinkingConsonant $ExtCccZwj* $Virama $ExtCccZwj* $LinkingConsonant; \n" \
+                        "$Extended_Pict $Extend* $ZWJ $Extended_Pict; \n" \
+                        "^$Prepend* $Regional_Indicator $Regional_Indicator / $Regional_Indicator; \n" \
+                        "^$Prepend* $Regional_Indicator $Regional_Indicator; \n" \
+                        ".;";
+
+static UChar* s_breakIteratorRules = NULL;
+
+// When doing string search operations using ICU, it is internally using a break iterator which doesn't allow breaking between some characters according to
+// the Grapheme Cluster Boundary Rules specified in http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules.
+// Unfortunately, not all rules will have the desired behavior we need to get in .NET. For example, the rules don't allow breaking between CR '\r' and LF '\n' characters.
+// When searching for "\n" in a string like "\r\n", will get not found result.
+// We are customizing the break iterator to exclude the CRxLF rule which don't allow breaking between CR and LF.
+// The general rules syntax explained in the doc https://unicode-org.github.io/icu/userguide/boundaryanalysis/break-rules.html.
+// The ICU latest rules definition exist here https://github.com/unicode-org/icu/blob/main/icu4c/source/data/brkitr/rules/char.txt.
+static UBreakIterator* CreateCustomizedBreakIterator()
+{
+    static UChar emptyString[1];
+    UBreakIterator* breaker;
+
+    UErrorCode status = U_ZERO_ERROR;
+    if (s_breakIteratorRules != NULL)
+    {
+        breaker = ubrk_openRules(s_breakIteratorRules, -1, emptyString, 0, NULL, &status);
+        return U_FAILURE(status) ? NULL : breaker;
+    }
+
+    int32_t oldRulesLength = (int32_t)strlen(BreakIteratorRuleOld);
+    int32_t newRulesLength = (int32_t)strlen(BreakIteratorRuleNew);
+
+    int32_t breakIteratorRulesLength = newRulesLength > oldRulesLength ? newRulesLength : oldRulesLength;
+
+    UChar* rules = (UChar*)calloc((size_t)breakIteratorRulesLength + 1, sizeof(UChar));
+    if (rules == NULL)
+    {
+        return NULL;
+    }
+
+    u_uastrncpy(rules, BreakIteratorRuleNew, newRulesLength);
+    rules[newRulesLength] = '\0';
+
+    breaker = ubrk_openRules(rules, newRulesLength, emptyString, 0, NULL, &status);
+    if (U_FAILURE(status))
+    {
+        status = U_ZERO_ERROR;
+        u_uastrncpy(rules, BreakIteratorRuleOld, oldRulesLength);
+        rules[oldRulesLength] = '\0';
+        breaker = ubrk_openRules(rules, oldRulesLength, emptyString, 0, NULL, &status);
+    }
+
+    if (U_FAILURE(status))
+    {
+        free(rules);
+        return NULL;
+    }
+
+    UChar* pNull = NULL;
+    if (!pal_atomic_cas_ptr((void* volatile*)&s_breakIteratorRules, rules, pNull))
+    {
+        free(rules);
+        assert(s_breakIteratorRules != NULL);
+    }
+
+    return breaker;
+}
+
+static void CloseSearchIterator(UStringSearch* pSearch)
+{
+    assert(pSearch != NULL);
+
+#if !defined(TARGET_WINDOWS)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wcast-qual"
+#endif
+    UBreakIterator* breakIterator = (UBreakIterator*)usearch_getBreakIterator(pSearch);
+#if !defined(TARGET_WINDOWS)
+#pragma GCC diagnostic pop
+#endif
+
+    usearch_close(pSearch);
+
+    if (breakIterator != NULL)
+    {
+        ubrk_close(breakIterator);
+    }
+}
+
 void GlobalizationNative_CloseSortHandle(SortHandle* pSortHandle)
 {
     for (int i = 0; i <= CompareOptionsMask; i++)
@@ -460,7 +604,7 @@ void GlobalizationNative_CloseSortHandle(SortHandle* pSortHandle)
             {
                 if (pSearch != USED_STRING_SEARCH)
                 {
-                    usearch_close(pSearch);
+                    CloseSearchIterator(pSearch);
                 }
                 pSortHandle->searchIteratorList[i].searchIterator = NULL;
                 SearchIteratorNode* pNext = pSortHandle->searchIteratorList[i].next;
@@ -470,7 +614,7 @@ void GlobalizationNative_CloseSortHandle(SortHandle* pSortHandle)
                 {
                     if (pNext->searchIterator != NULL && pNext->searchIterator != USED_STRING_SEARCH)
                     {
-                        usearch_close(pNext->searchIterator);
+                        CloseSearchIterator(pNext->searchIterator);
                     }
                     SearchIteratorNode* pCurrent = pNext;
                     pNext = pCurrent->next;
@@ -518,7 +662,7 @@ static const UCollator* GetCollatorFromSortHandle(SortHandle* pSortHandle, int32
 // CreateNewSearchNode will create a new node in the linked list and mark this node search handle as borrowed handle.
 static inline int32_t CreateNewSearchNode(SortHandle* pSortHandle, int32_t options)
 {
-    SearchIteratorNode* node = (SearchIteratorNode*) malloc(sizeof(SearchIteratorNode));
+    SearchIteratorNode* node = (SearchIteratorNode*)calloc(1, sizeof(SearchIteratorNode));
     if (node == NULL)
     {
         return false;
@@ -581,9 +725,14 @@ static int32_t GetSearchIteratorUsingCollator(
 
     if (*pSearchIterator == NULL)
     {
-        *pSearchIterator = usearch_openFromCollator(lpTarget, cwTargetLength, lpSource, cwSourceLength, pColl, NULL, &err);
+        UBreakIterator* breakIterator = CreateCustomizedBreakIterator();
+        *pSearchIterator = usearch_openFromCollator(lpTarget, cwTargetLength, lpSource, cwSourceLength, pColl, breakIterator, &err);
         if (!U_SUCCESS(err))
         {
+            if (breakIterator != NULL)
+            {
+                ubrk_close(breakIterator);
+            }
             assert(false && "Couldn't open the search iterator.");
             return -1;
         }
@@ -593,7 +742,7 @@ static int32_t GetSearchIteratorUsingCollator(
         {
             if (!CreateNewSearchNode(pSortHandle, options))
             {
-                usearch_close(*pSearchIterator);
+                CloseSearchIterator(*pSearchIterator);
                 return -1;
             }
         }
@@ -619,16 +768,22 @@ static int32_t GetSearchIteratorUsingCollator(
 
     if (*pSearchIterator == NULL) // Couldn't find any available handle to borrow then create a new one.
     {
-        *pSearchIterator = usearch_openFromCollator(lpTarget, cwTargetLength, lpSource, cwSourceLength, pColl, NULL, &err);
+        UBreakIterator* breakIterator = CreateCustomizedBreakIterator();
+        *pSearchIterator = usearch_openFromCollator(lpTarget, cwTargetLength, lpSource, cwSourceLength, pColl, breakIterator, &err);
         if (!U_SUCCESS(err))
         {
+            if (breakIterator != NULL)
+            {
+                ubrk_close(breakIterator);
+            }
+
             assert(false && "Couldn't open a new search iterator.");
             return -1;
         }
 
         if (!CreateNewSearchNode(pSortHandle, options))
         {
-            usearch_close(*pSearchIterator);
+            CloseSearchIterator(*pSearchIterator);
             return -1;
         }
 
index b70e31f..92ded52 100644 (file)
@@ -72,7 +72,9 @@ U_CAPI int32_t U_EXPORT2 ucal_getWindowsTimeZoneID(const UChar* id, int32_t len,
     PER_FUNCTION_BLOCK(u_strncpy, libicuuc, true) \
     PER_FUNCTION_BLOCK(u_tolower, libicuuc, true) \
     PER_FUNCTION_BLOCK(u_toupper, libicuuc, true) \
-    PER_FUNCTION_BLOCK(u_uastrcpy, libicuuc, true) \
+    PER_FUNCTION_BLOCK(u_uastrncpy, libicuuc, true) \
+    PER_FUNCTION_BLOCK(ubrk_close, libicui18n, true) \
+    PER_FUNCTION_BLOCK(ubrk_openRules, libicui18n, true) \
     PER_FUNCTION_BLOCK(ucal_add, libicui18n, true) \
     PER_FUNCTION_BLOCK(ucal_close, libicui18n, true) \
     PER_FUNCTION_BLOCK(ucal_get, libicui18n, true) \
@@ -155,6 +157,7 @@ U_CAPI int32_t U_EXPORT2 ucal_getWindowsTimeZoneID(const UChar* id, int32_t len,
     PER_FUNCTION_BLOCK(ures_open, libicuuc, true) \
     PER_FUNCTION_BLOCK(usearch_close, libicui18n, true) \
     PER_FUNCTION_BLOCK(usearch_first, libicui18n, true) \
+    PER_FUNCTION_BLOCK(usearch_getBreakIterator, libicui18n, true) \
     PER_FUNCTION_BLOCK(usearch_getMatchedLength, libicui18n, true) \
     PER_FUNCTION_BLOCK(usearch_last, libicui18n, true) \
     PER_FUNCTION_BLOCK(usearch_openFromCollator, libicui18n, true) \
@@ -215,7 +218,9 @@ FOR_ALL_ICU_FUNCTIONS
 #define u_strncpy(...) u_strncpy_ptr(__VA_ARGS__)
 #define u_tolower(...) u_tolower_ptr(__VA_ARGS__)
 #define u_toupper(...) u_toupper_ptr(__VA_ARGS__)
-#define u_uastrcpy(...) u_uastrcpy_ptr(__VA_ARGS__)
+#define u_uastrncpy(...) u_uastrncpy_ptr(__VA_ARGS__)
+#define ubrk_close(...) ubrk_close_ptr(__VA_ARGS__)
+#define ubrk_openRules(...) ubrk_openRules_ptr(__VA_ARGS__)
 #define ucal_add(...) ucal_add_ptr(__VA_ARGS__)
 #define ucal_close(...) ucal_close_ptr(__VA_ARGS__)
 #define ucal_get(...) ucal_get_ptr(__VA_ARGS__)
@@ -310,6 +315,7 @@ FOR_ALL_ICU_FUNCTIONS
 #define ures_open(...) ures_open_ptr(__VA_ARGS__)
 #define usearch_close(...) usearch_close_ptr(__VA_ARGS__)
 #define usearch_first(...) usearch_first_ptr(__VA_ARGS__)
+#define usearch_getBreakIterator(...) usearch_getBreakIterator_ptr(__VA_ARGS__)
 #define usearch_getMatchedLength(...) usearch_getMatchedLength_ptr(__VA_ARGS__)
 #define usearch_last(...) usearch_last_ptr(__VA_ARGS__)
 #define usearch_openFromCollator(...) usearch_openFromCollator_ptr(__VA_ARGS__)
index 4439787..1125ce9 100644 (file)
@@ -433,7 +433,6 @@ typedef struct UFieldPosition {
     int32_t endIndex;
 } UFieldPosition;
 
-
 void u_charsToUChars(const char * cs, UChar * us, int32_t length);
 void u_getVersion(UVersionInfo versionArray);
 int32_t u_strlen(const UChar * s);
@@ -442,7 +441,9 @@ UChar * u_strcpy(UChar * dst, const UChar * src);
 UChar * u_strncpy(UChar * dst, const UChar * src, int32_t n);
 UChar32 u_tolower(UChar32 c);
 UChar32 u_toupper(UChar32 c);
-UChar* u_uastrcpy(UChar * dst, const char * src);
+UChar* u_uastrncpy(UChar * dst, const char * src, int32_t n);
+void ubrk_close(UBreakIterator * bi);
+UBreakIterator* ubrk_openRules(const UChar * rules, int32_t rulesLength, const UChar * text, int32_t textLength, UParseError * parseErr, UErrorCode * status);
 void ucal_add(UCalendar * cal, UCalendarDateFields field, int32_t amount, UErrorCode * status);
 void ucal_close(UCalendar * cal);
 int32_t ucal_get(const UCalendar * cal, UCalendarDateFields field, UErrorCode * status);
@@ -532,6 +533,7 @@ const UChar * ures_getStringByIndex(const UResourceBundle * resourceBundle, int3
 UResourceBundle * ures_open(const char * packageName, const char * locale, UErrorCode * status);
 void usearch_close(UStringSearch * searchiter);
 int32_t usearch_first(UStringSearch * strsrch, UErrorCode * status);
+const UBreakIterator* usearch_getBreakIterator(const UStringSearch * strsrch);
 int32_t usearch_getMatchedLength(const UStringSearch * strsrch);
 int32_t usearch_last(UStringSearch * strsrch, UErrorCode * status);
 UStringSearch * usearch_openFromCollator(const UChar * pattern, int32_t patternlength, const UChar * text, int32_t textlength, const UCollator * collator, UBreakIterator * breakiter, UErrorCode * status);
index 2620c14..018746e 100644 (file)
@@ -223,7 +223,7 @@ static void FixupTimeZoneGenericDisplayName(const char* locale, const UChar* tim
         }
 
         // Make a UChar[] version of the test time zone id for use in the API calls.
-        u_uastrcpy(testTimeZoneId, testId);
+        u_uastrncpy(testTimeZoneId, testId, TZID_LENGTH);
 
         // Get the standard name from the test time zone.
         GetTimeZoneDisplayName_FromCalendar(locale, testTimeZoneId, timestamp, UCAL_STANDARD, testDisplayName, DISPLAY_NAME_LENGTH, err);
index 960db32..8c19567 100644 (file)
@@ -72,6 +72,7 @@ namespace System.Globalization.Tests
             yield return new object[] { s_invariantCompare, "FooBar", "Foo\u0400Bar", 0, 6, CompareOptions.Ordinal, -1, 0 };
             yield return new object[] { s_invariantCompare, "TestFooBA\u0300R", "FooB\u00C0R", 0, 11, CompareOptions.IgnoreNonSpace, 4, 7 };
             yield return new object[] { s_invariantCompare, "o\u0308", "o", 0, 2, CompareOptions.None, -1, 0 };
+            yield return new object[] { s_invariantCompare, "\r\n", "\n", 0, 2, CompareOptions.None, 1, 1 };
 
             // Weightless characters
             yield return new object[] { s_invariantCompare, "", "\u200d", 0, 0, CompareOptions.None, 0, 0 };
index a400ded..e5bd894 100644 (file)
@@ -86,6 +86,7 @@ namespace System.Globalization.Tests
             yield return new object[] { s_invariantCompare, "FooBar", "Foo\u0400Bar", 5, 6, CompareOptions.Ordinal, -1, 0 };
             yield return new object[] { s_invariantCompare, "TestFooBA\u0300R", "FooB\u00C0R", 10, 11, CompareOptions.IgnoreNonSpace, 4, 7 };
             yield return new object[] { s_invariantCompare, "o\u0308", "o", 1, 2, CompareOptions.None, -1, 0 };
+            yield return new object[] { s_invariantCompare, "\r\n", "\n", 1, 2, CompareOptions.None, 1, 1 };
 
             // Weightless characters
             // NLS matches weightless characters at the end of the string