Remove unsafe code and make scheme parsing faster in Uri (#90087)
authorMiha Zupan <mihazupan.zupan1@gmail.com>
Thu, 10 Aug 2023 17:10:21 +0000 (19:10 +0200)
committerGitHub <noreply@github.com>
Thu, 10 Aug 2023 17:10:21 +0000 (10:10 -0700)
* Remove unsafe code and make scheme parsing faster in Uri

* Reduce the number of range checks

* Switch loop to IndexOf

src/libraries/System.Private.Uri/src/System/DomainNameHelper.cs
src/libraries/System.Private.Uri/src/System/Uri.cs
src/libraries/System.Private.Uri/src/System/UriHelper.cs

index 89e1408..1a2d80b 100644 (file)
@@ -77,11 +77,7 @@ namespace System
             if (index >= 0)
             {
                 // We saw uppercase letters. Avoid allocating both the substring and the lower-cased variant.
-                return string.Create(end - start, (str, start), static (buffer, state) =>
-                {
-                    int newLength = state.str.AsSpan(state.start, buffer.Length).ToLowerInvariant(buffer);
-                    Debug.Assert(newLength == buffer.Length);
-                });
+                return UriHelper.SpanToLowerInvariantString(str.AsSpan(start, end - start));
             }
 
             if (str.AsSpan(start, end - start) is Localhost or Loopback)
index 3db22a0..770e035 100644 (file)
@@ -610,8 +610,10 @@ namespace System
                         break;
                     }
 
-                    UriParser? syntax = null;
-                    if (CheckSchemeSyntax(relativeStr.AsSpan(0, i), ref syntax) == ParsingError.None)
+                    ParsingError error = ParsingError.None;
+                    UriParser? syntax = CheckSchemeSyntax(relativeStr.AsSpan(0, i), ref error);
+
+                    if (error == ParsingError.None)
                     {
                         if (baseUri.Syntax == syntax)
                         {
@@ -1909,17 +1911,30 @@ namespace System
             if (length >= c_MaxUriBufferSize)
                 return ParsingError.SizeLimit;
 
-            //STEP1: parse scheme, lookup this Uri Syntax or create one using UnknownV1SyntaxFlags uri syntax template
-            fixed (char* pUriString = uriString)
+            // Fast path for valid http(s) schemes with no leading whitespace that are expected to be very common.
+            if (uriString.StartsWith("https:", StringComparison.OrdinalIgnoreCase))
+            {
+                syntax = UriParser.HttpsUri;
+                flags |= (Flags)6;
+            }
+            else if (uriString.StartsWith("http:", StringComparison.OrdinalIgnoreCase))
             {
+                syntax = UriParser.HttpUri;
+                flags |= (Flags)5;
+            }
+            else
+            {
+                // STEP1: parse scheme, lookup this Uri Syntax or create one using UnknownV1SyntaxFlags uri syntax template
                 ParsingError err = ParsingError.None;
-                int idx = ParseSchemeCheckImplicitFile(pUriString, length, ref err, ref flags, ref syntax);
+                int idx = ParseSchemeCheckImplicitFile(uriString, ref err, ref flags, ref syntax);
+                Debug.Assert((err is ParsingError.None) == (syntax is not null));
 
                 if (err != ParsingError.None)
                     return err;
 
                 flags |= (Flags)idx;
             }
+
             return ParsingError.None;
         }
 
@@ -3592,108 +3607,92 @@ namespace System
             InterlockedSetFlags(cF);
         }
 
-        //
         // verifies the syntax of the scheme part
         // Checks on implicit File: scheme due to simple Dos/Unc path passed
         // returns the start of the next component  position
-        // throws UriFormatException if invalid scheme
-        //
-        private static unsafe int ParseSchemeCheckImplicitFile(char* uriString, int length,
-            ref ParsingError err, ref Flags flags, ref UriParser? syntax)
+        private static int ParseSchemeCheckImplicitFile(string uriString, ref ParsingError err, ref Flags flags, ref UriParser? syntax)
         {
+            Debug.Assert(err == ParsingError.None);
             Debug.Assert((flags & Flags.Debug_LeftConstructor) == 0);
 
-            int idx = 0;
+            int i = 0;
 
-            //skip whitespace
-            while (idx < length && UriHelper.IsLWS(uriString[idx]))
+            // skip whitespace
+            while ((uint)i < (uint)uriString.Length && UriHelper.IsLWS(uriString[i]))
             {
-                ++idx;
+                i++;
             }
 
             // Unix: Unix path?
             // A path starting with 2 / or \ (including mixed) is treated as UNC and will be matched below
-            if (!OperatingSystem.IsWindows() && idx < length && uriString[idx] == '/' &&
-                (idx + 1 == length || (uriString[idx + 1] != '/' && uriString[idx + 1] != '\\')))
+            if (!OperatingSystem.IsWindows() &&
+                (uint)i < (uint)uriString.Length && uriString[i] == '/' &&
+                ((uint)(i + 1) >= (uint)uriString.Length || uriString[i + 1] is not ('/' or '\\')))
             {
                 flags |= (Flags.UnixPath | Flags.ImplicitFile | Flags.AuthorityFound);
                 syntax = UriParser.UnixFileUri;
-                return idx;
+                return i;
             }
 
-            // sets the recognizer for well known registered schemes
-            // file, ftp, http, https, uuid, etc
+            // Find the colon.
             // Note that we don't support one-letter schemes that will be put into a DOS path bucket
+            int colonOffset = uriString.AsSpan(i).IndexOf(':');
 
-            int end = idx;
-            while (end < length && uriString[end] != ':')
-            {
-                ++end;
-            }
-
-            // NB: On 64-bits we will use less optimized code from CheckSchemeSyntax()
-            //
-            if (IntPtr.Size == 4)
-            {
-                // long = 4chars: The minimal size of a known scheme is 2 + ':'
-                if (end != length && end >= idx + 2 &&
-                    CheckKnownSchemes((long*)(uriString + idx), end - idx, ref syntax))
-                {
-                    return end + 1;
-                }
-            }
-
-            //NB: A string must have at least 3 characters and at least 1 before ':'
-            if (idx + 2 >= length || end == idx)
+            // NB: A string must have at least 3 characters and at least 1 before ':'
+            if ((uint)(i + 2) >= (uint)uriString.Length ||
+                colonOffset == 0 ||
+                // Redundant checks to eliminate range checks below
+                (uint)i >= (uint)uriString.Length ||
+                (uint)(i + 1) >= (uint)uriString.Length)
             {
                 err = ParsingError.BadFormat;
                 return 0;
             }
 
-            //Check for supported special cases like a DOS file path OR a UNC share path
-            //NB: A string may not have ':' if this is a UNC path
+            // Check for supported special cases like a DOS file path OR a UNC share path
+            // NB: A string may not have ':' if this is a UNC path
+            if (uriString[i + 1] is ':' or '|')
             {
-                char c;
-                if ((c = uriString[idx + 1]) == ':' || c == '|')
+                // DOS-like path?
+                if (char.IsAsciiLetter(uriString[i]))
                 {
-                    //DOS-like path?
-                    if (char.IsAsciiLetter(uriString[idx]))
+                    if (uriString[i + 2] is '\\' or '/')
                     {
-                        if ((c = uriString[idx + 2]) == '\\' || c == '/')
-                        {
-                            flags |= (Flags.DosPath | Flags.ImplicitFile | Flags.AuthorityFound);
-                            syntax = UriParser.FileUri;
-                            return idx;
-                        }
-                        err = ParsingError.MustRootedPath;
-                        return 0;
+                        flags |= (Flags.DosPath | Flags.ImplicitFile | Flags.AuthorityFound);
+                        syntax = UriParser.FileUri;
+                        return i;
                     }
-                    if (c == ':')
-                        err = ParsingError.BadScheme;
-                    else
-                        err = ParsingError.BadFormat;
+
+                    err = ParsingError.MustRootedPath;
                     return 0;
                 }
-                else if ((c = uriString[idx]) == '/' || c == '\\')
+
+                err = uriString[i + 1] == ':' ? ParsingError.BadScheme : ParsingError.BadFormat;
+                return 0;
+            }
+            else if (uriString[i] is '/' or '\\')
+            {
+                // UNC share?
+                if (uriString[i + 1] is '\\' or '/')
                 {
-                    //UNC share?
-                    if ((c = uriString[idx + 1]) == '\\' || c == '/')
-                    {
-                        flags |= (Flags.UncPath | Flags.ImplicitFile | Flags.AuthorityFound);
-                        syntax = UriParser.FileUri;
-                        idx += 2;
-                        // V1.1 compat this will simply eat any slashes prepended to a UNC path
-                        while (idx < length && ((c = uriString[idx]) == '/' || c == '\\'))
-                            ++idx;
+                    flags |= (Flags.UncPath | Flags.ImplicitFile | Flags.AuthorityFound);
+                    syntax = UriParser.FileUri;
+                    i += 2;
 
-                        return idx;
+                    // V1.1 compat this will simply eat any slashes prepended to a UNC path
+                    while ((uint)i < (uint)uriString.Length && uriString[i] is '/' or '\\')
+                    {
+                        i++;
                     }
-                    err = ParsingError.BadFormat;
-                    return 0;
+
+                    return i;
                 }
+
+                err = ParsingError.BadFormat;
+                return 0;
             }
 
-            if (end == length)
+            if (colonOffset < 0)
             {
                 err = ParsingError.BadFormat;
                 return 0;
@@ -3701,275 +3700,75 @@ namespace System
 
             // This is a potentially valid scheme, but we have not identified it yet.
             // Check for illegal characters, canonicalize, and check the length.
-            err = CheckSchemeSyntax(new ReadOnlySpan<char>(uriString + idx, end - idx), ref syntax!);
-            if (err != ParsingError.None)
+            syntax = CheckSchemeSyntax(uriString.AsSpan(i, colonOffset), ref err);
+            if (syntax is null)
             {
                 return 0;
             }
-            return end + 1;
+            return i + colonOffset + 1;
         }
 
-        //
-        // Quickly parses well known schemes.
-        // nChars does not include the last ':'. Assuming there is one at the end of passed buffer
-        private static unsafe bool CheckKnownSchemes(long* lptr, int nChars, ref UriParser? syntax)
+        // This will check whether a scheme string follows the rules
+        private static UriParser? CheckSchemeSyntax(ReadOnlySpan<char> scheme, ref ParsingError error)
         {
-            //NOTE beware of too short input buffers!
-
-            const long _HTTP_Mask0 = 'h' | ('t' << 16) | ((long)'t' << 32) | ((long)'p' << 48);
-            const char _HTTPS_Mask1 = 's';
-            const int _WS_Mask = 'w' | ('s' << 16);
-            const long _WSS_Mask = 'w' | ('s' << 16) | ((long)'s' << 32) | ((long)':' << 48);
-            const long _FTP_Mask = 'f' | ('t' << 16) | ((long)'p' << 32) | ((long)':' << 48);
-            const long _FILE_Mask0 = 'f' | ('i' << 16) | ((long)'l' << 32) | ((long)'e' << 48);
-            const long _GOPHER_Mask0 = 'g' | ('o' << 16) | ((long)'p' << 32) | ((long)'h' << 48);
-            const int _GOPHER_Mask1 = 'e' | ('r' << 16);
-            const long _MAILTO_Mask0 = 'm' | ('a' << 16) | ((long)'i' << 32) | ((long)'l' << 48);
-            const int _MAILTO_Mask1 = 't' | ('o' << 16);
-            const long _NEWS_Mask0 = 'n' | ('e' << 16) | ((long)'w' << 32) | ((long)'s' << 48);
-            const long _NNTP_Mask0 = 'n' | ('n' << 16) | ((long)'t' << 32) | ((long)'p' << 48);
-            const long _UUID_Mask0 = 'u' | ('u' << 16) | ((long)'i' << 32) | ((long)'d' << 48);
-
-            const long _TELNET_Mask0 = 't' | ('e' << 16) | ((long)'l' << 32) | ((long)'n' << 48);
-            const int _TELNET_Mask1 = 'e' | ('t' << 16);
-
-            const long _NETXXX_Mask0 = 'n' | ('e' << 16) | ((long)'t' << 32) | ((long)'.' << 48);
-            const long _NETTCP_Mask1 = 't' | ('c' << 16) | ((long)'p' << 32) | ((long)':' << 48);
-            const long _NETPIPE_Mask1 = 'p' | ('i' << 16) | ((long)'p' << 32) | ((long)'e' << 48);
-
-            const long _LDAP_Mask0 = 'l' | ('d' << 16) | ((long)'a' << 32) | ((long)'p' << 48);
-
-
-            const long _LOWERCASE_Mask = 0x0020002000200020L;
-            const int _INT_LOWERCASE_Mask = 0x00200020;
-
-            if (nChars == 2)
-            {
-                // This is the only known scheme of length 2
-                if ((unchecked((int)*lptr) | _INT_LOWERCASE_Mask) == _WS_Mask)
-                {
-                    syntax = UriParser.WsUri;
-                    return true;
-                }
-                return false;
-            }
+            Debug.Assert(error == ParsingError.None);
 
-            //Map to a known scheme if possible
-            //upgrade 4 letters to ASCII lower case, keep a false case to stay false
-            switch (*lptr | _LOWERCASE_Mask)
+            switch (scheme.Length)
             {
-                case _HTTP_Mask0:
-                    if (nChars == 4)
-                    {
-                        syntax = UriParser.HttpUri;
-                        return true;
-                    }
-                    if (nChars == 5 && ((*(char*)(lptr + 1)) | 0x20) == _HTTPS_Mask1)
-                    {
-                        syntax = UriParser.HttpsUri;
-                        return true;
-                    }
-                    break;
-                case _WSS_Mask:
-                    if (nChars == 3)
-                    {
-                        syntax = UriParser.WssUri;
-                        return true;
-                    }
-                    break;
-                case _FILE_Mask0:
-                    if (nChars == 4)
-                    {
-                        syntax = UriParser.FileUri;
-                        return true;
-                    }
-                    break;
-                case _FTP_Mask:
-                    if (nChars == 3)
-                    {
-                        syntax = UriParser.FtpUri;
-                        return true;
-                    }
-                    break;
-
-                case _NEWS_Mask0:
-                    if (nChars == 4)
-                    {
-                        syntax = UriParser.NewsUri;
-                        return true;
-                    }
-                    break;
-
-                case _NNTP_Mask0:
-                    if (nChars == 4)
-                    {
-                        syntax = UriParser.NntpUri;
-                        return true;
-                    }
+                case 2:
+                    if (scheme.Equals("ws", StringComparison.OrdinalIgnoreCase)) return UriParser.WsUri;
                     break;
 
-                case _UUID_Mask0:
-                    if (nChars == 4)
-                    {
-                        syntax = UriParser.UuidUri;
-                        return true;
-                    }
+                case 3:
+                    if (scheme.Equals("wss", StringComparison.OrdinalIgnoreCase)) return UriParser.WssUri;
+                    if (scheme.Equals("ftp", StringComparison.OrdinalIgnoreCase)) return UriParser.FtpUri;
                     break;
 
-                case _GOPHER_Mask0:
-                    if (nChars == 6 && (*(int*)(lptr + 1) | _INT_LOWERCASE_Mask) == _GOPHER_Mask1)
-                    {
-                        syntax = UriParser.GopherUri;
-                        return true;
-                    }
-                    break;
-                case _MAILTO_Mask0:
-                    if (nChars == 6 && (*(int*)(lptr + 1) | _INT_LOWERCASE_Mask) == _MAILTO_Mask1)
-                    {
-                        syntax = UriParser.MailToUri;
-                        return true;
-                    }
+                case 4:
+                    if (scheme.Equals("http", StringComparison.OrdinalIgnoreCase)) return UriParser.HttpUri;
+                    if (scheme.Equals("file", StringComparison.OrdinalIgnoreCase)) return UriParser.FileUri;
+                    if (scheme.Equals("uuid", StringComparison.OrdinalIgnoreCase)) return UriParser.UuidUri;
+                    if (scheme.Equals("nntp", StringComparison.OrdinalIgnoreCase)) return UriParser.NntpUri;
+                    if (scheme.Equals("ldap", StringComparison.OrdinalIgnoreCase)) return UriParser.LdapUri;
+                    if (scheme.Equals("news", StringComparison.OrdinalIgnoreCase)) return UriParser.NewsUri;
                     break;
 
-                case _TELNET_Mask0:
-                    if (nChars == 6 && (*(int*)(lptr + 1) | _INT_LOWERCASE_Mask) == _TELNET_Mask1)
-                    {
-                        syntax = UriParser.TelnetUri;
-                        return true;
-                    }
+                case 5:
+                    if (scheme.Equals("https", StringComparison.OrdinalIgnoreCase)) return UriParser.HttpsUri;
                     break;
 
-                case _NETXXX_Mask0:
-                    if (nChars == 8 && (*(lptr + 1) | _LOWERCASE_Mask) == _NETPIPE_Mask1)
-                    {
-                        syntax = UriParser.NetPipeUri;
-                        return true;
-                    }
-                    else if (nChars == 7 && (*(lptr + 1) | _LOWERCASE_Mask) == _NETTCP_Mask1)
-                    {
-                        syntax = UriParser.NetTcpUri;
-                        return true;
-                    }
+                case 6:
+                    if (scheme.Equals("mailto", StringComparison.OrdinalIgnoreCase)) return UriParser.MailToUri;
+                    if (scheme.Equals("gopher", StringComparison.OrdinalIgnoreCase)) return UriParser.GopherUri;
+                    if (scheme.Equals("telnet", StringComparison.OrdinalIgnoreCase)) return UriParser.TelnetUri;
                     break;
 
-                case _LDAP_Mask0:
-                    if (nChars == 4)
-                    {
-                        syntax = UriParser.LdapUri;
-                        return true;
-                    }
+                case 7:
+                    if (scheme.Equals("net.tcp", StringComparison.OrdinalIgnoreCase)) return UriParser.NetTcpUri;
                     break;
-                default: break;
-            }
-            return false;
-        }
-
-        //
-        // This will check whether a scheme string follows the rules
-        //
-        private static unsafe ParsingError CheckSchemeSyntax(ReadOnlySpan<char> span, ref UriParser? syntax)
-        {
-            static char ToLowerCaseAscii(char c) => char.IsAsciiLetterUpper(c) ? (char)(c | 0x20) : c;
-
-            if (span.Length == 0)
-            {
-                return ParsingError.BadScheme;
-            }
 
-            // The first character must be an alpha.  Validate that and store it as lower-case, as
-            // all of the fast-path checks need that value.
-            char firstLower = span[0];
-            if (char.IsAsciiLetterUpper(firstLower))
-            {
-                firstLower = (char)(firstLower | 0x20);
-            }
-            else if (!char.IsAsciiLetterLower(firstLower))
-            {
-                return ParsingError.BadScheme;
-            }
-
-            // Special-case common and known schemes to avoid allocations and dictionary lookups in these cases.
-            const int wsMask = 'w' << 8 | 's';
-            const int ftpMask = 'f' << 16 | 't' << 8 | 'p';
-            const int wssMask = 'w' << 16 | 's' << 8 | 's';
-            const int fileMask = 'f' << 24 | 'i' << 16 | 'l' << 8 | 'e';
-            const int httpMask = 'h' << 24 | 't' << 16 | 't' << 8 | 'p';
-            const int mailMask = 'm' << 24 | 'a' << 16 | 'i' << 8 | 'l';
-            switch (span.Length)
-            {
-                case 2:
-                    if (wsMask == (firstLower << 8 | ToLowerCaseAscii(span[1])))
-                    {
-                        syntax = UriParser.WsUri;
-                        return ParsingError.None;
-                    }
-                    break;
-                case 3:
-                    switch (firstLower << 16 | ToLowerCaseAscii(span[1]) << 8 | ToLowerCaseAscii(span[2]))
-                    {
-                        case ftpMask:
-                            syntax = UriParser.FtpUri;
-                            return ParsingError.None;
-                        case wssMask:
-                            syntax = UriParser.WssUri;
-                            return ParsingError.None;
-                    }
-                    break;
-                case 4:
-                    switch (firstLower << 24 | ToLowerCaseAscii(span[1]) << 16 | ToLowerCaseAscii(span[2]) << 8 | ToLowerCaseAscii(span[3]))
-                    {
-                        case httpMask:
-                            syntax = UriParser.HttpUri;
-                            return ParsingError.None;
-                        case fileMask:
-                            syntax = UriParser.FileUri;
-                            return ParsingError.None;
-                    }
-                    break;
-                case 5:
-                    if (httpMask == (firstLower << 24 | ToLowerCaseAscii(span[1]) << 16 | ToLowerCaseAscii(span[2]) << 8 | ToLowerCaseAscii(span[3])) &&
-                        ToLowerCaseAscii(span[4]) == 's')
-                    {
-                        syntax = UriParser.HttpsUri;
-                        return ParsingError.None;
-                    }
-                    break;
-                case 6:
-                    if (mailMask == (firstLower << 24 | ToLowerCaseAscii(span[1]) << 16 | ToLowerCaseAscii(span[2]) << 8 | ToLowerCaseAscii(span[3])) &&
-                        ToLowerCaseAscii(span[4]) == 't' && ToLowerCaseAscii(span[5]) == 'o')
-                    {
-                        syntax = UriParser.MailToUri;
-                        return ParsingError.None;
-                    }
+                case 8:
+                    if (scheme.Equals("net.pipe", StringComparison.OrdinalIgnoreCase)) return UriParser.NetPipeUri;
                     break;
             }
 
-            // The scheme is not known.  Validate all of the characters in the input.
-            for (int i = 1; i < span.Length; i++)
+            // scheme = alpha *(alpha | digit | '+' | '-' | '.')
+            if (scheme.Length == 0 ||
+                !char.IsAsciiLetter(scheme[0]) ||
+                scheme.ContainsAnyExcept(s_schemeChars))
             {
-                char c = span[i];
-                if (!char.IsAsciiLetterOrDigit(c) && c != '+' && c != '-' && c != '.')
-                {
-                    return ParsingError.BadScheme;
-                }
+                error = ParsingError.BadScheme;
+                return null;
             }
 
-            if (span.Length > c_MaxUriSchemeName)
+            if (scheme.Length > c_MaxUriSchemeName)
             {
-                return ParsingError.SchemeLimit;
+                error = ParsingError.SchemeLimit;
+                return null;
             }
 
             // Then look up the syntax in a string-based table.
-#pragma warning disable CS8500 // takes address of managed type
-            ReadOnlySpan<char> tmpSpan = span; // avoid address exposing the span and impacting the other code in the method that uses it
-            string str = string.Create(tmpSpan.Length, (IntPtr)(&tmpSpan), (buffer, spanPtr) =>
-            {
-                int charsWritten = (*(ReadOnlySpan<char>*)spanPtr).ToLowerInvariant(buffer);
-                Debug.Assert(charsWritten == buffer.Length);
-            });
-#pragma warning restore CS8500
-            syntax = UriParser.FindOrFetchAsUnknownV1Syntax(str);
-            return ParsingError.None;
+            return UriParser.FindOrFetchAsUnknownV1Syntax(UriHelper.SpanToLowerInvariantString(scheme));
         }
 
         //
index 3fa5b49..e4edf02 100644 (file)
@@ -10,6 +10,17 @@ namespace System
 {
     internal static class UriHelper
     {
+        public static unsafe string SpanToLowerInvariantString(ReadOnlySpan<char> span)
+        {
+#pragma warning disable CS8500 // takes address of managed type
+            return string.Create(span.Length, (IntPtr)(&span), static (buffer, spanPtr) =>
+            {
+                int charsWritten = (*(ReadOnlySpan<char>*)spanPtr).ToLowerInvariant(buffer);
+                Debug.Assert(charsWritten == buffer.Length);
+            });
+#pragma warning restore CS8500
+        }
+
         // http://host/Path/Path/File?Query is the base of
         //      - http://host/Path/Path/File/ ...    (those "File" words may be different in semantic but anyway)
         //      - http://host/Path/Path/#Fragment