Use IndexOf{Any} for case-sensitive singletons in FindFirstChar
author Stephen Toub <stoub@microsoft.com>
Mon, 16 Dec 2019 23:19:49 +0000 (18:19 -0500)
committer Stephen Toub <stoub@microsoft.com>
Thu, 9 Jan 2020 03:50:08 +0000 (22:50 -0500)
Takes advantage of vectorization in string.IndexOf.

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs

index 0ec1ba8..d46c3bb 100644 (file)
@@ -769,6 +769,47 @@ namespace System.Text.RegularExpressions
             !IsSubtraction(set) &&
             (set[SetStartIndex] == LastChar || set[SetStartIndex] + 1 == set[SetStartIndex + 1]);
 
+        /// <summary>Enumerates every character in the specified set, writing them in order into the provided span.</summary>
+        /// <param name="set">The character class string to enumerate.</param>
+        /// <param name="chars">The destination span; its length caps how many chars may be produced.</param>
+        /// <returns>
+        /// The number of chars stored, or 0 if they won't all fit or the set is not a supported simple set (see remarks).
+        /// </returns>
+        /// <remarks>
+        /// Only handles character classes made up solely of range pairs: no categories, no negation,
+        /// no subtraction, and a non-zero, even set length.  Any other class yields 0.
+        /// </remarks>
+        public static int GetSetChars(string set, Span<char> chars)
+        {
+            int setLength = set[SetLengthIndex];
+            if (setLength == 0 ||
+                setLength % 2 != 0 || // an odd length can't be walked as start/end pairs below
+                set[CategoryLengthIndex] != 0 ||
+                IsNegated(set) ||
+                IsSubtraction(set))
+            {
+                return 0;
+            }
+
+            int count = 0;
+            for (int i = SetStartIndex; i < SetStartIndex + setLength; i += 2) // one start/end pair per iteration
+            {
+                int curSetStart = set[i];
+                int curSetEnd = set[i + 1];
+                for (int c = curSetStart; c < curSetEnd; c++) // the end value is treated as exclusive
+                {
+                    if (count >= chars.Length)
+                    {
+                        return 0; // more chars than the span can hold; report failure rather than a partial result
+                    }
+
+                    chars[count++] = (char)c;
+                }
+            }
+
+            return count;
+        }
+
         internal static bool IsSubtraction(string charClass) =>
             charClass.Length > SetStartIndex +
             charClass[CategoryLengthIndex] +
index 5dee39d..27d9c91 100644 (file)
@@ -48,6 +48,9 @@ namespace System.Text.RegularExpressions
         private static readonly MethodInfo s_charIsWhiteSpaceMethod = typeof(char).GetMethod("IsWhiteSpace", new Type[] { typeof(char) })!;
         private static readonly MethodInfo s_stringGetCharsMethod = typeof(string).GetMethod("get_Chars", new Type[] { typeof(int) })!;
         private static readonly MethodInfo s_stringAsSpanMethod = typeof(MemoryExtensions).GetMethod("AsSpan", new Type[] { typeof(string), typeof(int), typeof(int) })!;
+        private static readonly MethodInfo s_stringIndexOf = typeof(string).GetMethod("IndexOf", new Type[] { typeof(char), typeof(int), typeof(int) })!;
+        private static readonly MethodInfo s_spanIndexOfAnyCharChar = typeof(MemoryExtensions).GetMethod("IndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char));
+        private static readonly MethodInfo s_spanIndexOfAnyCharCharChar = typeof(MemoryExtensions).GetMethod("IndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char));
         private static readonly MethodInfo s_spanGetItemMethod = typeof(ReadOnlySpan<char>).GetMethod("get_Item", new Type[] { typeof(int) })!;
         private static readonly MethodInfo s_spanGetLengthMethod = typeof(ReadOnlySpan<char>).GetMethod("get_Length")!;
         private static readonly MethodInfo s_cultureInfoGetCurrentCultureMethod = typeof(CultureInfo).GetMethod("get_CurrentCulture")!;
@@ -1201,20 +1204,11 @@ namespace System.Text.RegularExpressions
                 Ldc(0);
                 Ret();
             }
-            else // for left-to-right, use span to avoid bounds checks when doing normal forward iteration recognized by the JIT
+            else // for left-to-right, we can take advantage of vectorization and JIT optimizations
             {
-                LocalBuilder charInClassLocal = _temp1Local;
                 LocalBuilder iLocal = _temp2Local;
-                _temp3Local = DeclareReadOnlySpanChar();
-                LocalBuilder textSpanLocal = _temp3Local;
-
                 Label returnFalseLabel = DefineLabel();
-                Label checkSpanLengthLabel = DefineLabel();
-                Label loopBody = DefineLabel();
-                Label charNotInClassLabel = DefineLabel();
-
-                // string runtext = this.runtext
-                Mvfldloc(s_runtextField, _runtextLocal);
+                Label updatePosAndReturnFalse = DefineLabel();
 
                 // if (runtextend - runtextpos > 0)
                 Ldthisfld(s_runtextendField);
@@ -1223,53 +1217,146 @@ namespace System.Text.RegularExpressions
                 Ldc(0);
                 BleFar(returnFalseLabel);
 
-                // ReadOnlySpan<char> span = runtext.AsSpan(runtextpos, runtextend - runtextpos);
-                Ldloc(_runtextLocal);
-                Ldthisfld(s_runtextposField);
-                Ldthisfld(s_runtextendField);
-                Ldthisfld(s_runtextposField);
-                Sub();
-                Call(s_stringAsSpanMethod);
-                Stloc(textSpanLocal);
+                // string runtext = this.runtext
+                Mvfldloc(s_runtextField, _runtextLocal);
 
-                // for (int i = 0;
-                Ldc(0);
-                Stloc(iLocal);
-                BrFar(checkSpanLengthLabel);
-
-                // if (CharInClass(span[i], "..."))
-                MarkLabel(loopBody);
-                Ldloca(textSpanLocal);
-                Ldloc(iLocal);
-                Call(s_spanGetItemMethod);
-                LdindU2();
-                EmitCallCharInClass(_fcPrefix.GetValueOrDefault().Prefix, _fcPrefix.GetValueOrDefault().CaseInsensitive, charInClassLocal);
-                BrfalseFar(charNotInClassLabel);
-
-                // runtextpos += i; return true;
-                Ldthis();
-                Ldthisfld(s_runtextposField);
-                Ldloc(iLocal);
-                Add();
-                Stfld(s_runtextposField);
-                Ldc(1);
-                Ret();
+                Span<char> setChars = stackalloc char[3];
+                int setCharsCount;
+                if (!_fcPrefix.GetValueOrDefault().CaseInsensitive &&
+                    (setCharsCount = RegexCharClass.GetSetChars(_fcPrefix.GetValueOrDefault().Prefix, setChars)) > 0)
+                {
+                    // This is a case-sensitive class with a small number of characters in the class, small enough
+                    // that we can generate an IndexOf{Any} call.  That takes advantage of optimizations in
+                    // IndexOf{Any}, such as vectorization, which our open-coded loop through the span doesn't have.
+                    switch (setCharsCount)
+                    {
+                        case 1:
+                            // int i = runtext.IndexOf(setChars[0], runtextpos, runtextend - runtextpos);
+                            Ldloc(_runtextLocal);
+                            Ldc(setChars[0]);
+                            Ldthisfld(s_runtextposField);
+                            Ldthisfld(s_runtextendField);
+                            Ldthisfld(s_runtextposField);
+                            Sub();
+                            Call(s_stringIndexOf);
+                            Stloc(iLocal);
 
-                // for (...; ...; i++)
-                MarkLabel(charNotInClassLabel);
-                Ldloc(iLocal);
-                Ldc(1);
-                Add();
-                Stloc(iLocal);
+                            // if (i >= 0)
+                            Ldloc(iLocal);
+                            Ldc(0);
+                            BltFar(updatePosAndReturnFalse);
+
+                            // runtextpos = i; return true;
+                            Mvlocfld(iLocal, s_runtextposField);
+                            Ldc(1);
+                            Ret();
+                            break;
+
+                        case 2:
+                        case 3:
+                            // int i = runtext.AsSpan(runtextpos, runtextend - runtextpos).IndexOfAny(setChars[0], setChars[1]{, setChars[2]});
+                            Ldloc(_runtextLocal);
+                            Ldthisfld(s_runtextposField);
+                            Ldthisfld(s_runtextendField);
+                            Ldthisfld(s_runtextposField);
+                            Sub();
+                            Call(s_stringAsSpanMethod);
+                            Ldc(setChars[0]);
+                            Ldc(setChars[1]);
+                            if (setCharsCount == 3)
+                            {
+                                Ldc(setChars[2]);
+                                Call(s_spanIndexOfAnyCharCharChar);
+                            }
+                            else
+                            {
+                                Call(s_spanIndexOfAnyCharChar);
+                            }
+                            Stloc(iLocal);
+
+                            // if (i >= 0)
+                            Ldloc(iLocal);
+                            Ldc(0);
+                            BltFar(updatePosAndReturnFalse);
+
+                            // runtextpos += i; return true;
+                            Ldthis();
+                            Ldthisfld(s_runtextposField);
+                            Ldloc(iLocal);
+                            Add();
+                            Stfld(s_runtextposField);
+                            Ldc(1);
+                            Ret();
+                            break;
+
+                        default:
+                            Debug.Fail("Unexpected setCharsCount: " + setCharsCount);
+                            break;
+                    }
+                }
+                else
+                {
+                    // Either this isn't a class with just a few characters in it, or this is case insensitive.
+                    // Either way, create a span and iterate through it rather than the original string in order
+                    // to avoid bounds checks on each access.
+
+                    LocalBuilder charInClassLocal = _temp1Local;
+                    _temp3Local = DeclareReadOnlySpanChar();
+                    LocalBuilder textSpanLocal = _temp3Local;
 
-                // for (...; i < span.Length; ...);
-                MarkLabel(checkSpanLengthLabel);
-                Ldloc(iLocal);
-                Ldloca(textSpanLocal);
-                Call(s_spanGetLengthMethod);
-                BltFar(loopBody);
+                    Label checkSpanLengthLabel = DefineLabel();
+                    Label charNotInClassLabel = DefineLabel();
+                    Label loopBody = DefineLabel();
+
+                    // ReadOnlySpan<char> span = runtext.AsSpan(runtextpos, runtextend - runtextpos);
+                    Ldloc(_runtextLocal);
+                    Ldthisfld(s_runtextposField);
+                    Ldthisfld(s_runtextendField);
+                    Ldthisfld(s_runtextposField);
+                    Sub();
+                    Call(s_stringAsSpanMethod);
+                    Stloc(textSpanLocal);
+
+                    // for (int i = 0;
+                    Ldc(0);
+                    Stloc(iLocal);
+                    BrFar(checkSpanLengthLabel);
+
+                    // if (CharInClass(span[i], "..."))
+                    MarkLabel(loopBody);
+                    Ldloca(textSpanLocal);
+                    Ldloc(iLocal);
+                    Call(s_spanGetItemMethod);
+                    LdindU2();
+                    EmitCallCharInClass(_fcPrefix.GetValueOrDefault().Prefix, _fcPrefix.GetValueOrDefault().CaseInsensitive, charInClassLocal);
+                    BrfalseFar(charNotInClassLabel);
+
+                    // runtextpos += i; return true;
+                    Ldthis();
+                    Ldthisfld(s_runtextposField);
+                    Ldloc(iLocal);
+                    Add();
+                    Stfld(s_runtextposField);
+                    Ldc(1);
+                    Ret();
+
+                    // for (...; ...; i++)
+                    MarkLabel(charNotInClassLabel);
+                    Ldloc(iLocal);
+                    Ldc(1);
+                    Add();
+                    Stloc(iLocal);
+
+                    // for (...; i < span.Length; ...);
+                    MarkLabel(checkSpanLengthLabel);
+                    Ldloc(iLocal);
+                    Ldloca(textSpanLocal);
+                    Call(s_spanGetLengthMethod);
+                    BltFar(loopBody);
+                }
 
                 // runtextpos = runtextend;
+                MarkLabel(updatePosAndReturnFalse);
                 Ldthis();
                 Ldthisfld(s_runtextendField);
                 Stfld(s_runtextposField);