Fix RegexBoyerMoore.IsMatch case-insensitive (regression from 2.1 to 3.0) (#32984)
author Stephen Toub <stoub@microsoft.com>
Fri, 28 Feb 2020 23:12:35 +0000 (18:12 -0500)
committer GitHub <noreply@github.com>
Fri, 28 Feb 2020 23:12:35 +0000 (18:12 -0500)
As part of a larger change in .NET Core 3.0, a character-by-character comparison loop in RegexBoyerMoore.IsMatch was replaced with a call to string.Compare. That is a regression whenever a culture-aware character-by-character comparison yields a different result from a culture-aware comparison of the whole multi-character string. This change restores the per-character comparison loop.

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexBoyerMoore.cs
src/libraries/System.Text.RegularExpressions/tests/RegexCultureTests.cs

index ddf4dd8..03137a8 100644 (file)
@@ -43,7 +43,17 @@ namespace System.Text.RegularExpressions
             // We're doing this for your own protection. (Really, for speed.)
             Debug.Assert(pattern.Length != 0, "RegexBoyerMoore called with an empty string. This is bad for perf");
             Debug.Assert(pattern.Length <= MaxLimit, "RegexBoyerMoore can take a long time for large patterns");
-            Debug.Assert(!caseInsensitive || pattern.ToLower(culture) == pattern, "RegexBoyerMoore called with a pattern which is not lowercased with caseInsensitive true.");
+#if DEBUG
+            if (caseInsensitive)
+            {
+                foreach (char c in pattern)
+                {
+                    // We expect each individual character to have been lower-cased. We don't validate the whole
+                    // string at once because the rest of the library doesn't currently recognize/support surrogate pairs.
+                    Debug.Assert(c == culture.TextInfo.ToLower(c), "Pattern wasn't lowercased with provided culture");
+                }
+            }
+#endif
 
             Pattern = pattern;
             RightToLeft = rightToLeft;
@@ -229,7 +239,17 @@ namespace System.Text.RegularExpressions
 
             if (CaseInsensitive)
             {
-                return string.Compare(Pattern, 0, text, index, Pattern.Length, ignoreCase: true, _culture) == 0;
+                TextInfo textinfo = _culture.TextInfo;
+
+                for (int i = 0; i < Pattern.Length; i++)
+                {
+                    if (Pattern[i] != textinfo.ToLower(text[index + i]))
+                    {
+                        return false;
+                    }
+                }
+
+                return true;
             }
 
             return Pattern.AsSpan().SequenceEqual(text.AsSpan(index, Pattern.Length));
index e7a5106..6666f05 100644 (file)
@@ -12,11 +12,30 @@ namespace System.Text.RegularExpressions.Tests
     public class RegexCultureTests
     {
         [Theory]
+        [InlineData("^aa$", "aA", "da-DK", RegexOptions.None, false)]
+        [InlineData("^aA$", "aA", "da-DK", RegexOptions.None, true)]
+        [InlineData("^aa$", "aA", "da-DK", RegexOptions.IgnoreCase, true)]
+        [InlineData("^aA$", "aA", "da-DK", RegexOptions.IgnoreCase, true)]
+        public void CharactersComparedOneByOne_AnchoredPattern(string pattern, string input, string culture, RegexOptions options, bool expected)
+        {
+            // Regex is expected to compare characters one by one; each case runs with and without RegexOptions.Compiled.
+            // NOTE(review): presumably da-DK is used because a culture-aware comparison of the whole string can
+            // treat "aa" differently from "aA" there, unlike a per-character comparison -- confirm.
+            using (new ThreadCultureChange(culture))
+            {
+                foreach (RegexOptions compiled in new[] { RegexOptions.None, RegexOptions.Compiled })
+                {
+                    Assert.Equal(expected, new Regex(pattern, options | compiled).IsMatch(input));
+                }
+            }
+        }
+
+        [Theory]
         [InlineData(RegexOptions.None)]
         [InlineData(RegexOptions.IgnoreCase | RegexOptions.CultureInvariant)]
         [InlineData(RegexOptions.Compiled)]
         [InlineData(RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.CultureInvariant)]
-        public void CharactersComparedOneByOne(RegexOptions options)
+        public void CharactersComparedOneByOne_Invariant(RegexOptions options)
         {
             // Regex compares characters one by one.  If that changes, it could impact the behavior of
             // a case like this, where these characters are not the same, but the strings compare