Tweak Regex compiler's FindFirstChar code gen (#2342)
author Stephen Toub <stoub@microsoft.com>
Thu, 30 Jan 2020 16:08:25 +0000 (11:08 -0500)
committer GitHub <noreply@github.com>
Thu, 30 Jan 2020 16:08:25 +0000 (11:08 -0500)
* Tweak Regex compiler's FindFirstChar code gen

- If we're able to use IndexOf{Any} and we don't have any information on subsequent characters, we can avoid emitting the code related to looping, avoid unnecessary slicing, etc.
- If the Boyer-Moore prefix contains non-ASCII text, we currently don't use it when compiling FindFirstChar. However, a previous change also skipped computing other prefix information whenever a Boyer-Moore prefix was found, which leaves the compiled FindFirstChar without any usable prefix information when there's Unicode in the prefix string. This fixes that to still compute the other prefix information in that case.
- We're currently often generating multiple "this.runtextpos = runtextend; return false" blocks.  We can consolidate them.
- Makes a few cleanliness changes to the assembly generator, e.g. ensuring the internal types are sealed, the types are all beforefieldinit as the C# compiler would do, etc.

Also added/tweaked a few tests.

* Address PR feedback

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexAssemblyCompiler.cs
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexWriter.cs
src/libraries/System.Text.RegularExpressions/tests/MonoRegexTests.cs
src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs

index a22236d..1e6daf6 100644 (file)
@@ -55,7 +55,7 @@ namespace System.Text.RegularExpressions
             string typenumString = ((uint)Interlocked.Increment(ref s_typeCount)).ToString();
 
             // Generate the RegexRunner-derived type.
-            TypeBuilder regexRunnerTypeBuilder = DefineType(_module, $"{name}Runner{typenumString}", false, typeof(RegexRunner));
+            TypeBuilder regexRunnerTypeBuilder = DefineType(_module, $"{name}Runner{typenumString}", isPublic: false, isSealed: true, typeof(RegexRunner));
             _ilg = DefineMethod(regexRunnerTypeBuilder, "Go", null);
             GenerateGo();
             _ilg = DefineMethod(regexRunnerTypeBuilder, "FindFirstChar", typeof(bool));
@@ -65,13 +65,13 @@ namespace System.Text.RegularExpressions
             Type runnerType = regexRunnerTypeBuilder.CreateType()!;
 
             // Generate the RegexRunnerFactory-derived type.
-            TypeBuilder regexRunnerFactoryTypeBuilder = DefineType(_module, $"{name}Factory{typenumString}", false, typeof(RegexRunnerFactory));
+            TypeBuilder regexRunnerFactoryTypeBuilder = DefineType(_module, $"{name}Factory{typenumString}", isPublic: false, isSealed: true, typeof(RegexRunnerFactory));
             _ilg = DefineMethod(regexRunnerFactoryTypeBuilder, "CreateInstance", typeof(RegexRunner));
             GenerateCreateInstance(runnerType);
             Type regexRunnerFactoryType = regexRunnerFactoryTypeBuilder.CreateType()!;
 
             // Generate the Regex-derived type.
-            TypeBuilder regexTypeBuilder = DefineType(_module, name, isPublic, typeof(Regex));
+            TypeBuilder regexTypeBuilder = DefineType(_module, name, isPublic, isSealed: false, typeof(Regex));
             ConstructorBuilder defaultCtorBuilder = regexTypeBuilder.DefineConstructor(MethodAttributes.Public, CallingConventions.Standard, Type.EmptyTypes);
             _ilg = defaultCtorBuilder.GetILGenerator();
             GenerateRegexDefaultCtor(pattern, options, regexRunnerFactoryType, code, matchTimeout);
@@ -242,8 +242,16 @@ namespace System.Text.RegularExpressions
         }
 
         /// <summary>Begins the definition of a new type with a specified base class</summary>
-        private static TypeBuilder DefineType(ModuleBuilder moduleBuilder, string typeName, bool isPublic, Type inheritFromClass) =>
-            moduleBuilder.DefineType(typeName, (isPublic ? TypeAttributes.Public : TypeAttributes.NotPublic) | TypeAttributes.Class, inheritFromClass);
+        private static TypeBuilder DefineType(ModuleBuilder moduleBuilder, string typeName, bool isPublic, bool isSealed, Type inheritFromClass)
+        {
+            TypeAttributes attrs = TypeAttributes.Class | TypeAttributes.BeforeFieldInit | (isPublic ? TypeAttributes.Public : TypeAttributes.NotPublic);
+            if (isSealed)
+            {
+                attrs |= TypeAttributes.Sealed;
+            }
+
+            return moduleBuilder.DefineType(typeName, attrs, inheritFromClass);
+        }
 
         /// <summary>Begins the definition of a new method (no args) with a specified return value.</summary>
         private static ILGenerator DefineMethod(TypeBuilder typeBuilder, string methname, Type? returnType) =>
index c8ee603..b9aba89 100644 (file)
@@ -969,49 +969,56 @@ namespace System.Text.RegularExpressions
             }
 
             // Generate length check.  If the input isn't long enough to possibly match, fail quickly.
+            // It's rare for min required length to be 0, so we don't bother special-casing the check,
+            // especially since we want the "return false" code regardless.
             int minRequiredLength = _code.Tree.MinRequiredLength;
             Debug.Assert(minRequiredLength >= 0);
-            if (minRequiredLength > 0)
+            Label returnFalse = DefineLabel();
+            Label finishedLengthCheck = DefineLabel();
+            if (!_code.RightToLeft)
             {
-                Label finishedLengthCheck = DefineLabel();
-                if (!_code.RightToLeft)
+                // if (runtextpos > runtextend - _code.Tree.MinRequiredLength)
+                // {
+                //     this.runtextpos = runtextend;
+                //     return false;
+                // }
+                Ldloc(_runtextposLocal);
+                Ldloc(_runtextendLocal);
+                if (minRequiredLength > 0)
                 {
-                    // if (runtextpos > runtextend - _code.Tree.MinRequiredLength)
-                    // {
-                    //     this.runtextpos = runtextend;
-                    //     return false;
-                    // }
-                    Ldloc(_runtextposLocal);
-                    Ldloc(_runtextendLocal);
                     Ldc(minRequiredLength);
                     Sub();
-                    Ble(finishedLengthCheck);
-                    Ldthis();
-                    Ldloc(_runtextendLocal);
-                    Stfld(s_runtextposField);
-                    Ldc(0);
-                    Ret();
                 }
-                else
+                Ble(finishedLengthCheck);
+
+                MarkLabel(returnFalse);
+                Ldthis();
+                Ldloc(_runtextendLocal);
+            }
+            else
+            {
+                // if (runtextpos - _code.Tree.MinRequiredLength < runtextbeg)
+                // {
+                //     this.runtextpos = runtextbeg;
+                //     return false;
+                // }
+                Ldloc(_runtextposLocal);
+                if (minRequiredLength > 0)
                 {
-                    // if (runtextpos - _code.Tree.MinRequiredLength < runtextbeg)
-                    // {
-                    //     runtextpos = runtextbeg;
-                    //     return false;
-                    // }
-                    Ldloc(_runtextposLocal);
                     Ldc(minRequiredLength);
                     Sub();
-                    Ldloc(_runtextbegLocal!);
-                    Bge(finishedLengthCheck);
-                    Ldthis();
-                    Ldloc(_runtextbegLocal!);
-                    Stfld(s_runtextposField);
-                    Ldc(0);
-                    Ret();
                 }
-                MarkLabel(finishedLengthCheck);
+                Ldloc(_runtextbegLocal!);
+                Bge(finishedLengthCheck);
+
+                MarkLabel(returnFalse);
+                Ldthis();
+                Ldloc(_runtextbegLocal!);
             }
+            Stfld(s_runtextposField);
+            Ldc(0);
+            Ret();
+            MarkLabel(finishedLengthCheck);
 
             // Generate anchor checks.
             if ((_anchors & (RegexPrefixAnalyzer.Beginning | RegexPrefixAnalyzer.Start | RegexPrefixAnalyzer.EndZ | RegexPrefixAnalyzer.End)) != 0)
@@ -1024,11 +1031,7 @@ namespace System.Text.RegularExpressions
                         Ldloc(_runtextposLocal);
                         Ldthisfld(s_runtextbegField);
                         Ble(l1);
-                        Ldthis();
-                        Ldloc(_runtextendLocal);
-                        Stfld(s_runtextposField);
-                        Ldc(0);
-                        Ret();
+                        Br(returnFalse);
                         MarkLabel(l1);
                     }
 
@@ -1038,11 +1041,7 @@ namespace System.Text.RegularExpressions
                         Ldloc(_runtextposLocal);
                         Ldthisfld(s_runtextstartField);
                         Ble(l1);
-                        Ldthis();
-                        Ldloc(_runtextendLocal);
-                        Stfld(s_runtextposField);
-                        Ldc(0);
-                        Ret();
+                        BrFar(returnFalse);
                         MarkLabel(l1);
                     }
 
@@ -1085,11 +1084,7 @@ namespace System.Text.RegularExpressions
                         Ldloc(_runtextposLocal);
                         Ldloc(_runtextendLocal);
                         Bge(l1);
-                        Ldthis();
-                        Ldloc(_runtextbegLocal!);
-                        Stfld(s_runtextposField);
-                        Ldc(0);
-                        Ret();
+                        Br(returnFalse);
                         MarkLabel(l1);
                     }
 
@@ -1111,11 +1106,7 @@ namespace System.Text.RegularExpressions
                         Ldc('\n');
                         Beq(l2);
                         MarkLabel(l1);
-                        Ldthis();
-                        Ldloc(_runtextbegLocal!);
-                        Stfld(s_runtextposField);
-                        Ldc(0);
-                        Ret();
+                        BrFar(returnFalse);
                         MarkLabel(l2);
                     }
 
@@ -1125,11 +1116,7 @@ namespace System.Text.RegularExpressions
                         Ldloc(_runtextposLocal);
                         Ldthisfld(s_runtextstartField);
                         Bge(l1);
-                        Ldthis();
-                        Ldloc(_runtextbegLocal!);
-                        Stfld(s_runtextposField);
-                        Ldc(0);
-                        Ret();
+                        BrFar(returnFalse);
                         MarkLabel(l1);
                     }
 
@@ -1158,7 +1145,6 @@ namespace System.Text.RegularExpressions
                 LocalBuilder limitLocal = _temp2Local;
                 Label lDefaultAdvance = DefineLabel();
                 Label lAdvance = DefineLabel();
-                Label lFail = DefineLabel();
                 Label lStart = DefineLabel();
                 Label lPartialMatch = DefineLabel();
 
@@ -1208,11 +1194,11 @@ namespace System.Text.RegularExpressions
                 Ldloc(limitLocal);
                 if (!_code.RightToLeft)
                 {
-                    BgeFar(lFail);
+                    BgeFar(returnFalse);
                 }
                 else
                 {
-                    BltFar(lFail);
+                    BltFar(returnFalse);
                 }
 
                 Rightchar();
@@ -1324,14 +1310,6 @@ namespace System.Text.RegularExpressions
                 Stfld(s_runtextposField);
                 Ldc(1);
                 Ret();
-
-                MarkLabel(lFail);
-
-                Ldthis();
-                Ldloc(_code.RightToLeft ? _runtextbegLocal! : _runtextendLocal);
-                Stfld(s_runtextposField);
-                Ldc(0);
-                Ret();
             }
             else if (_leadingCharClasses is null)
             {
@@ -1422,7 +1400,6 @@ namespace System.Text.RegularExpressions
                 Debug.Assert(_leadingCharClasses != null && _leadingCharClasses.Length > 0);
 
                 LocalBuilder iLocal = _temp2Local;
-                Label returnFalse = DefineLabel();
 
                 // If minRequiredLength > 0, we already output a more stringent check.  In the rare case
                 // where we were unable to get an accurate enough min required length to ensure it's larger
@@ -1445,10 +1422,6 @@ namespace System.Text.RegularExpressions
                 _temp3Local = DeclareReadOnlySpanChar();
                 LocalBuilder textSpanLocal = _temp3Local;
 
-                Label checkSpanLengthLabel = DefineLabel();
-                Label charNotInClassLabel = DefineLabel();
-                Label loopBody = DefineLabel();
-
                 // ReadOnlySpan<char> span = this.runtext.AsSpan(runtextpos, runtextend - runtextpos);
                 Ldthisfld(s_runtextField);
                 Ldloc(_runtextposLocal);
@@ -1458,48 +1431,66 @@ namespace System.Text.RegularExpressions
                 Call(s_stringAsSpanIntIntMethod);
                 Stloc(textSpanLocal);
 
-                // for (int i = 0;
-                Ldc(0);
-                Stloc(iLocal);
-                BrFar(checkSpanLengthLabel);
-
-                MarkLabel(loopBody);
-
                 // If we can use IndexOf{Any}, try to accelerate the skip loop via vectorization to match the first prefix.
                 // We can use it if this is a case-sensitive class with a small number of characters in the class.
                 Span<char> setChars = stackalloc char[3]; // up to 3 characters handled by IndexOf{Any} below
-                int setCharsCount;
-                int charClassIndex = 0;
-                if (!_leadingCharClasses[0].CaseInsensitive &&
-                    (setCharsCount = RegexCharClass.GetSetChars(_leadingCharClasses[0].CharClass, setChars)) > 0)
+                int setCharsCount = 0, charClassIndex = 0;
+                bool canUseIndexOf =
+                    !_leadingCharClasses[0].CaseInsensitive &&
+                    (setCharsCount = RegexCharClass.GetSetChars(_leadingCharClasses[0].CharClass, setChars)) > 0;
+                bool needLoop = !canUseIndexOf || _leadingCharClasses.Length > 1;
+
+                Label checkSpanLengthLabel = default;
+                Label charNotInClassLabel = default;
+                Label loopBody = default;
+                if (needLoop)
                 {
-                    charClassIndex++;
+                    checkSpanLengthLabel = DefineLabel();
+                    charNotInClassLabel = DefineLabel();
+                    loopBody = DefineLabel();
+
+                    // for (int i = 0;
+                    Ldc(0);
+                    Stloc(iLocal);
+                    BrFar(checkSpanLengthLabel);
+                    MarkLabel(loopBody);
+                }
+
+                if (canUseIndexOf)
+                {
+                    charClassIndex = 1;
+
+                    if (needLoop)
+                    {
+                        // textSpan.Slice(iLocal)
+                        Ldloca(textSpanLocal);
+                        Ldloc(iLocal);
+                        Call(s_spanSliceIntMethod);
+                    }
+                    else
+                    {
+                        // textSpan
+                        Ldloc(textSpanLocal);
+                    }
+
                     switch (setCharsCount)
                     {
                         case 1:
-                            // tmp = span.Slice(i).IndexOf(setChars[0]);
-                            Ldloca(textSpanLocal);
-                            Ldloc(iLocal);
-                            Call(s_spanSliceIntMethod);
+                            // tmp = ...IndexOf(setChars[0]);
                             Ldc(setChars[0]);
                             Call(s_spanIndexOf);
                             break;
 
                         case 2:
-                            // tmp = span.Slice(i).IndexOfAny(setChars[0], setChars[1]);
-                            Ldloca(textSpanLocal);
-                            Ldloc(iLocal);
-                            Call(s_spanSliceIntMethod);
+                            // tmp = ...IndexOfAny(setChars[0], setChars[1]);
                             Ldc(setChars[0]);
                             Ldc(setChars[1]);
                             Call(s_spanIndexOfAnyCharChar);
                             break;
 
-                        case 3:
-                            // tmp = span.Slice(i).IndexOfAny(setChars[0], setChars[1], setChars[2]});
-                            Ldloca(textSpanLocal);
-                            Ldloc(iLocal);
-                            Call(s_spanSliceIntMethod);
+                        default: // 3
+                            // tmp = ...IndexOfAny(setChars[0], setChars[1], setChars[2]);
+                            Debug.Assert(setCharsCount == 3);
                             Ldc(setChars[0]);
                             Ldc(setChars[1]);
                             Ldc(setChars[2]);
@@ -1507,18 +1498,22 @@ namespace System.Text.RegularExpressions
                             break;
                     }
 
-                    // i += tmp;
+                    // i = tmp; // or i += tmp if there's a loop
                     // if (tmp < 0) goto returnFalse;
                     Dup();
-                    Ldloc(iLocal);
-                    Add();
+                    if (needLoop)
+                    {
+                        Ldloc(iLocal);
+                        Add();
+                    }
                     Stloc(iLocal);
                     Ldc(0);
                     BltFar(returnFalse);
 
-                    // if (i >= span.Length - (_leadingCharClasses.Length - 1)) goto returnFalse;
+                    // if (i >= textSpan.Length - (_leadingCharClasses.Length - 1)) goto returnFalse;
                     if (_leadingCharClasses.Length > 1)
                     {
+                        Debug.Assert(needLoop);
                         Ldloca(textSpanLocal);
                         Call(s_spanGetLengthMethod);
                         Ldc(_leadingCharClasses.Length - 1);
@@ -1528,13 +1523,14 @@ namespace System.Text.RegularExpressions
                     }
                 }
 
-                // if (!CharInClass(span[i], prefix[0], "...")) goto returnFalse;
-                // if (!CharInClass(span[i + 1], prefix[1], "...")) goto returnFalse;
-                // if (!CharInClass(span[i + 2], prefix[2], "...")) goto returnFalse;
+                // if (!CharInClass(textSpan[i], prefix[0], "...")) goto returnFalse;
+                // if (!CharInClass(textSpan[i + 1], prefix[1], "...")) goto returnFalse;
+                // if (!CharInClass(textSpan[i + 2], prefix[2], "...")) goto returnFalse;
                 // ...
                 Debug.Assert(charClassIndex == 0 || charClassIndex == 1);
                 for ( ; charClassIndex < _leadingCharClasses.Length; charClassIndex++)
                 {
+                    Debug.Assert(needLoop);
                     Ldloca(textSpanLocal);
                     Ldloc(iLocal);
                     if (charClassIndex > 0)
@@ -1558,32 +1554,32 @@ namespace System.Text.RegularExpressions
                 Ldc(1);
                 Ret();
 
-                // for (...; ...; i++)
-                MarkLabel(charNotInClassLabel);
-                Ldloc(iLocal);
-                Ldc(1);
-                Add();
-                Stloc(iLocal);
-
-                // for (...; i < span.Length - (_leadingCharClasses.Length - 1); ...);
-                MarkLabel(checkSpanLengthLabel);
-                Ldloc(iLocal);
-                Ldloca(textSpanLocal);
-                Call(s_spanGetLengthMethod);
-                if (_leadingCharClasses.Length > 1)
+                if (needLoop)
                 {
-                    Ldc(_leadingCharClasses.Length - 1);
-                    Sub();
-                }
-                BltFar(loopBody);
+                    MarkLabel(charNotInClassLabel);
 
-                // runtextpos = runtextend;
-                MarkLabel(returnFalse);
-                Ldthis();
-                Ldloc(_runtextendLocal);
-                Stfld(s_runtextposField);
-                Ldc(0);
-                Ret();
+                    // for (...; ...; i++)
+                    Ldloc(iLocal);
+                    Ldc(1);
+                    Add();
+                    Stloc(iLocal);
+
+                    // for (...; i < span.Length - (_leadingCharClasses.Length - 1); ...);
+                    MarkLabel(checkSpanLengthLabel);
+                    Ldloc(iLocal);
+                    Ldloca(textSpanLocal);
+                    Call(s_spanGetLengthMethod);
+                    if (_leadingCharClasses.Length > 1)
+                    {
+                        Ldc(_leadingCharClasses.Length - 1);
+                        Sub();
+                    }
+                    BltFar(loopBody);
+
+                    // runtextpos = runtextend;
+                    // return false;
+                    BrFar(returnFalse);
+                }
             }
         }
 
index dcb6f0a..6828011 100644 (file)
@@ -133,6 +133,7 @@ namespace System.Text.RegularExpressions
             int[] emitted = _emitted.AsSpan().ToArray();
 
             bool rtl = (tree.Options & RegexOptions.RightToLeft) != 0;
+            bool compiled = (tree.Options & RegexOptions.Compiled) != 0;
 
             // Compute prefixes to help optimize FindFirstChar.
             RegexBoyerMoore? boyerMoorePrefix = null;
@@ -145,9 +146,14 @@ namespace System.Text.RegularExpressions
                 CultureInfo culture = (tree.Options & RegexOptions.CultureInvariant) != 0 ? CultureInfo.InvariantCulture : CultureInfo.CurrentCulture;
                 boyerMoorePrefix = new RegexBoyerMoore(leadingSubstring, leadingSubstringCI, rtl, culture);
             }
-            else
+
+            // If we didn't find a single leading substring, or if we found one but we won't be able to use it for a Boyer-Moore
+            // search, try to compute the character set that might begin the string.
+            if (boyerMoorePrefix is null ||
+                (boyerMoorePrefix.NegativeUnicode != null && compiled)) // compilation won't use Boyer-Moore if it has a negative Unicode table
             {
-                // If we didn't find a single leading substring, try to compute the characters set that might begin the string.
+                boyerMoorePrefix = null;
+
                 // First we employ a less aggressive but more valuable computation to see if we can find sets for each of the first N
                 // characters in the string.  If that's unsuccessful, we employ a more aggressive check to compute a set for just
                 // the first character in the string.
index 76b508c..86b6c40 100644 (file)
@@ -9,7 +9,6 @@
 //         (c) 2002
 
 using System.Collections.Generic;
-using System.Diagnostics;
 using Xunit;
 
 namespace System.Text.RegularExpressions.Tests
@@ -23,16 +22,16 @@ namespace System.Text.RegularExpressions.Tests
         [MemberData(nameof(RegexTestCasesWithOptions))]
         public void ValidateRegex(string pattern, RegexOptions options, string input, string expected)
         {
-            string result;
+            string result = "Fail.";
             try
             {
                 var re = new Regex(pattern, options);
-                int[] groupNums = re.GetGroupNumbers();
                 Match m = re.Match(input);
 
                 if (m.Success)
                 {
                     result = "Pass.";
+                    int[] groupNums = re.GetGroupNumbers();
                     for (int i = 0; i < m.Groups.Count; ++i)
                     {
                         int gid = groupNums[i];
@@ -45,12 +44,8 @@ namespace System.Text.RegularExpressions.Tests
                         }
                     }
                 }
-                else
-                {
-                    result = "Fail.";
-                }
             }
-            catch
+            catch (ArgumentException)
             {
                 result = "Error.";
             }
@@ -62,7 +57,7 @@ namespace System.Text.RegularExpressions.Tests
         {
             foreach (object[] obj in RegexTestCases())
             {
-                yield return new object[] { obj[0], (RegexOptions)obj[1], obj[2], obj[3] };
+                yield return new object[] { obj[0], obj[1], obj[2], obj[3] };
                 yield return new object[] { obj[0], RegexOptions.CultureInvariant | (RegexOptions)obj[1], obj[2], obj[3] };
                 yield return new object[] { obj[0], RegexOptions.Compiled | (RegexOptions)obj[1], obj[2], obj[3] };
                 yield return new object[] { obj[0], RegexOptions.Compiled | RegexOptions.CultureInvariant | (RegexOptions)obj[1], obj[2], obj[3] };
@@ -1064,7 +1059,7 @@ namespace System.Text.RegularExpressions.Tests
             yield return new object[] { @"a{1,2147483647}", RegexOptions.None, "a", "Pass. Group[0]=(0,1)" };
             yield return new object[] { @"^((\[(?<NAME>[^\]]+)\])|(?<NAME>[^\.\[\]]+))$", RegexOptions.None, "[a]", "Pass. Group[0]=(0,3) Group[1]=(0,3) Group[2]=(0,3) Group[3]=(1,1)" };
 
-            //// Ported from https://github.com/mono/mono/blob/0f2995e95e98e082c7c7039e17175cf2c6a00034/mcs/class/System/Test/System.Text.RegularExpressions/RegexMatchTests.cs
+            // Ported from https://github.com/mono/mono/blob/0f2995e95e98e082c7c7039e17175cf2c6a00034/mcs/class/System/Test/System.Text.RegularExpressions/RegexMatchTests.cs
             yield return new object[] { @"(a)(b)(c)", RegexOptions.ExplicitCapture, "abc", "Pass. Group[0]=(0,3)" };
             yield return new object[] { @"(a)(?<1>b)(c)", RegexOptions.ExplicitCapture, "abc", "Pass. Group[0]=(0,3) Group[1]=(1,1)" };
             yield return new object[] { @"(a)(?<2>b)(c)", RegexOptions.None, "abc", "Pass. Group[0]=(0,3) Group[1]=(0,1) Group[2]=(1,1)(2,1)" };
index a892ffe..f1e3a18 100644 (file)
@@ -327,8 +327,17 @@ namespace System.Text.RegularExpressions.Tests
                 yield return new object[] { @"(cat)(\c[*)(dog)", "asdlkcat\u00FFdogiwod", RegexOptions.None, 0, 15, false, string.Empty };
             }
 
-            // Surrogate pairs splitted up into UTF-16 code units.
+            // Surrogate pairs split up into UTF-16 code units.
             yield return new object[] { @"(\uD82F[\uDCA0-\uDCA3])", "\uD82F\uDCA2", RegexOptions.CultureInvariant, 0, 2, true, "\uD82F\uDCA2" };
+
+            // Unicode text
+            foreach (RegexOptions options in new[] { RegexOptions.None, RegexOptions.RightToLeft, RegexOptions.IgnoreCase | RegexOptions.CultureInvariant })
+            {
+                yield return new object[] { "\u05D0\u05D1\u05D2\u05D3(\u05D4\u05D5|\u05D6\u05D7|\u05D8)", "abc\u05D0\u05D1\u05D2\u05D3\u05D4\u05D5def", options, 3, 6, true, "\u05D0\u05D1\u05D2\u05D3\u05D4\u05D5" };
+                yield return new object[] { "\u05D0(\u05D4\u05D5|\u05D6\u05D7|\u05D8)", "\u05D0\u05D8", options, 0, 2, true, "\u05D0\u05D8" };
+                yield return new object[] { "\u05D0(?:\u05D1|\u05D2|\u05D3)", "\u05D0\u05D2", options, 0, 2, true, "\u05D0\u05D2" };
+                yield return new object[] { "\u05D0(?:\u05D1|\u05D2|\u05D3)", "\u05D0\u05D4", options, 0, 0, false, "" };
+            }
         }
 
         [Theory]