From: Miha Zupan Date: Thu, 10 Aug 2023 17:10:21 +0000 (+0200) Subject: Remove unsafe code and make scheme parsing faster in Uri (#90087) X-Git-Tag: accepted/tizen/unified/riscv/20231226.055536~374 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=6f6e2482c6d97ba17b337d6a1a55971286b53927;p=platform%2Fupstream%2Fdotnet%2Fruntime.git Remove unsafe code and make scheme parsing faster in Uri (#90087) * Remove unsafe code and make scheme parsing faster in Uri * Reduce the number of range checks * Switch loop to IndexOf --- diff --git a/src/libraries/System.Private.Uri/src/System/DomainNameHelper.cs b/src/libraries/System.Private.Uri/src/System/DomainNameHelper.cs index 89e1408..1a2d80b 100644 --- a/src/libraries/System.Private.Uri/src/System/DomainNameHelper.cs +++ b/src/libraries/System.Private.Uri/src/System/DomainNameHelper.cs @@ -77,11 +77,7 @@ namespace System if (index >= 0) { // We saw uppercase letters. Avoid allocating both the substring and the lower-cased variant. - return string.Create(end - start, (str, start), static (buffer, state) => - { - int newLength = state.str.AsSpan(state.start, buffer.Length).ToLowerInvariant(buffer); - Debug.Assert(newLength == buffer.Length); - }); + return UriHelper.SpanToLowerInvariantString(str.AsSpan(start, end - start)); } if (str.AsSpan(start, end - start) is Localhost or Loopback) diff --git a/src/libraries/System.Private.Uri/src/System/Uri.cs b/src/libraries/System.Private.Uri/src/System/Uri.cs index 3db22a0..770e035 100644 --- a/src/libraries/System.Private.Uri/src/System/Uri.cs +++ b/src/libraries/System.Private.Uri/src/System/Uri.cs @@ -610,8 +610,10 @@ namespace System break; } - UriParser? syntax = null; - if (CheckSchemeSyntax(relativeStr.AsSpan(0, i), ref syntax) == ParsingError.None) + ParsingError error = ParsingError.None; + UriParser? 
syntax = CheckSchemeSyntax(relativeStr.AsSpan(0, i), ref error); + + if (error == ParsingError.None) { if (baseUri.Syntax == syntax) { @@ -1909,17 +1911,30 @@ namespace System if (length >= c_MaxUriBufferSize) return ParsingError.SizeLimit; - //STEP1: parse scheme, lookup this Uri Syntax or create one using UnknownV1SyntaxFlags uri syntax template - fixed (char* pUriString = uriString) + // Fast path for valid http(s) schemes with no leading whitespace that are expected to be very common. + if (uriString.StartsWith("https:", StringComparison.OrdinalIgnoreCase)) + { + syntax = UriParser.HttpsUri; + flags |= (Flags)6; + } + else if (uriString.StartsWith("http:", StringComparison.OrdinalIgnoreCase)) { + syntax = UriParser.HttpUri; + flags |= (Flags)5; + } + else + { + // STEP1: parse scheme, lookup this Uri Syntax or create one using UnknownV1SyntaxFlags uri syntax template ParsingError err = ParsingError.None; - int idx = ParseSchemeCheckImplicitFile(pUriString, length, ref err, ref flags, ref syntax); + int idx = ParseSchemeCheckImplicitFile(uriString, ref err, ref flags, ref syntax); + Debug.Assert((err is ParsingError.None) == (syntax is not null)); if (err != ParsingError.None) return err; flags |= (Flags)idx; } + return ParsingError.None; } @@ -3592,108 +3607,92 @@ namespace System InterlockedSetFlags(cF); } - // // verifies the syntax of the scheme part // Checks on implicit File: scheme due to simple Dos/Unc path passed // returns the start of the next component position - // throws UriFormatException if invalid scheme - // - private static unsafe int ParseSchemeCheckImplicitFile(char* uriString, int length, - ref ParsingError err, ref Flags flags, ref UriParser? syntax) + private static int ParseSchemeCheckImplicitFile(string uriString, ref ParsingError err, ref Flags flags, ref UriParser? 
syntax) { + Debug.Assert(err == ParsingError.None); Debug.Assert((flags & Flags.Debug_LeftConstructor) == 0); - int idx = 0; + int i = 0; - //skip whitespace - while (idx < length && UriHelper.IsLWS(uriString[idx])) + // skip whitespace + while ((uint)i < (uint)uriString.Length && UriHelper.IsLWS(uriString[i])) { - ++idx; + i++; } // Unix: Unix path? // A path starting with 2 / or \ (including mixed) is treated as UNC and will be matched below - if (!OperatingSystem.IsWindows() && idx < length && uriString[idx] == '/' && - (idx + 1 == length || (uriString[idx + 1] != '/' && uriString[idx + 1] != '\\'))) + if (!OperatingSystem.IsWindows() && + (uint)i < (uint)uriString.Length && uriString[i] == '/' && + ((uint)(i + 1) >= (uint)uriString.Length || uriString[i + 1] is not ('/' or '\\'))) { flags |= (Flags.UnixPath | Flags.ImplicitFile | Flags.AuthorityFound); syntax = UriParser.UnixFileUri; - return idx; + return i; } - // sets the recognizer for well known registered schemes - // file, ftp, http, https, uuid, etc + // Find the colon. 
// Note that we don't support one-letter schemes that will be put into a DOS path bucket + int colonOffset = uriString.AsSpan(i).IndexOf(':'); - int end = idx; - while (end < length && uriString[end] != ':') - { - ++end; - } - - // NB: On 64-bits we will use less optimized code from CheckSchemeSyntax() - // - if (IntPtr.Size == 4) - { - // long = 4chars: The minimal size of a known scheme is 2 + ':' - if (end != length && end >= idx + 2 && - CheckKnownSchemes((long*)(uriString + idx), end - idx, ref syntax)) - { - return end + 1; - } - } - - //NB: A string must have at least 3 characters and at least 1 before ':' - if (idx + 2 >= length || end == idx) + // NB: A string must have at least 3 characters and at least 1 before ':' + if ((uint)(i + 2) >= (uint)uriString.Length || + colonOffset == 0 || + // Redundant checks to eliminate range checks below + (uint)i >= (uint)uriString.Length || + (uint)(i + 1) >= (uint)uriString.Length) { err = ParsingError.BadFormat; return 0; } - //Check for supported special cases like a DOS file path OR a UNC share path - //NB: A string may not have ':' if this is a UNC path + // Check for supported special cases like a DOS file path OR a UNC share path + // NB: A string may not have ':' if this is a UNC path + if (uriString[i + 1] is ':' or '|') { - char c; - if ((c = uriString[idx + 1]) == ':' || c == '|') + // DOS-like path? + if (char.IsAsciiLetter(uriString[i])) { - //DOS-like path? 
- if (char.IsAsciiLetter(uriString[idx])) + if (uriString[i + 2] is '\\' or '/') { - if ((c = uriString[idx + 2]) == '\\' || c == '/') - { - flags |= (Flags.DosPath | Flags.ImplicitFile | Flags.AuthorityFound); - syntax = UriParser.FileUri; - return idx; - } - err = ParsingError.MustRootedPath; - return 0; + flags |= (Flags.DosPath | Flags.ImplicitFile | Flags.AuthorityFound); + syntax = UriParser.FileUri; + return i; } - if (c == ':') - err = ParsingError.BadScheme; - else - err = ParsingError.BadFormat; + + err = ParsingError.MustRootedPath; return 0; } - else if ((c = uriString[idx]) == '/' || c == '\\') + + err = uriString[i + 1] == ':' ? ParsingError.BadScheme : ParsingError.BadFormat; + return 0; + } + else if (uriString[i] is '/' or '\\') + { + // UNC share? + if (uriString[i + 1] is '\\' or '/') { - //UNC share? - if ((c = uriString[idx + 1]) == '\\' || c == '/') - { - flags |= (Flags.UncPath | Flags.ImplicitFile | Flags.AuthorityFound); - syntax = UriParser.FileUri; - idx += 2; - // V1.1 compat this will simply eat any slashes prepended to a UNC path - while (idx < length && ((c = uriString[idx]) == '/' || c == '\\')) - ++idx; + flags |= (Flags.UncPath | Flags.ImplicitFile | Flags.AuthorityFound); + syntax = UriParser.FileUri; + i += 2; - return idx; + // V1.1 compat this will simply eat any slashes prepended to a UNC path + while ((uint)i < (uint)uriString.Length && uriString[i] is '/' or '\\') + { + i++; } - err = ParsingError.BadFormat; - return 0; + + return i; } + + err = ParsingError.BadFormat; + return 0; } - if (end == length) + if (colonOffset < 0) { err = ParsingError.BadFormat; return 0; @@ -3701,275 +3700,75 @@ namespace System // This is a potentially valid scheme, but we have not identified it yet. // Check for illegal characters, canonicalize, and check the length. 
- err = CheckSchemeSyntax(new ReadOnlySpan<char>(uriString + idx, end - idx), ref syntax!); - if (err != ParsingError.None) + syntax = CheckSchemeSyntax(uriString.AsSpan(i, colonOffset), ref err); + if (syntax is null) { return 0; } - return end + 1; + return i + colonOffset + 1; } - // - // Quickly parses well known schemes. - // nChars does not include the last ':'. Assuming there is one at the end of passed buffer - private static unsafe bool CheckKnownSchemes(long* lptr, int nChars, ref UriParser? syntax) + // This will check whether a scheme string follows the rules + private static UriParser? CheckSchemeSyntax(ReadOnlySpan<char> scheme, ref ParsingError error) { - //NOTE beware of too short input buffers! - - const long _HTTP_Mask0 = 'h' | ('t' << 16) | ((long)'t' << 32) | ((long)'p' << 48); - const char _HTTPS_Mask1 = 's'; - const int _WS_Mask = 'w' | ('s' << 16); - const long _WSS_Mask = 'w' | ('s' << 16) | ((long)'s' << 32) | ((long)':' << 48); - const long _FTP_Mask = 'f' | ('t' << 16) | ((long)'p' << 32) | ((long)':' << 48); - const long _FILE_Mask0 = 'f' | ('i' << 16) | ((long)'l' << 32) | ((long)'e' << 48); - const long _GOPHER_Mask0 = 'g' | ('o' << 16) | ((long)'p' << 32) | ((long)'h' << 48); - const int _GOPHER_Mask1 = 'e' | ('r' << 16); - const long _MAILTO_Mask0 = 'm' | ('a' << 16) | ((long)'i' << 32) | ((long)'l' << 48); - const int _MAILTO_Mask1 = 't' | ('o' << 16); - const long _NEWS_Mask0 = 'n' | ('e' << 16) | ((long)'w' << 32) | ((long)'s' << 48); - const long _NNTP_Mask0 = 'n' | ('n' << 16) | ((long)'t' << 32) | ((long)'p' << 48); - const long _UUID_Mask0 = 'u' | ('u' << 16) | ((long)'i' << 32) | ((long)'d' << 48); - - const long _TELNET_Mask0 = 't' | ('e' << 16) | ((long)'l' << 32) | ((long)'n' << 48); - const int _TELNET_Mask1 = 'e' | ('t' << 16); - - const long _NETXXX_Mask0 = 'n' | ('e' << 16) | ((long)'t' << 32) | ((long)'.'
<< 48); - const long _NETTCP_Mask1 = 't' | ('c' << 16) | ((long)'p' << 32) | ((long)':' << 48); - const long _NETPIPE_Mask1 = 'p' | ('i' << 16) | ((long)'p' << 32) | ((long)'e' << 48); - - const long _LDAP_Mask0 = 'l' | ('d' << 16) | ((long)'a' << 32) | ((long)'p' << 48); - - - const long _LOWERCASE_Mask = 0x0020002000200020L; - const int _INT_LOWERCASE_Mask = 0x00200020; - - if (nChars == 2) - { - // This is the only known scheme of length 2 - if ((unchecked((int)*lptr) | _INT_LOWERCASE_Mask) == _WS_Mask) - { - syntax = UriParser.WsUri; - return true; - } - return false; - } + Debug.Assert(error == ParsingError.None); - //Map to a known scheme if possible - //upgrade 4 letters to ASCII lower case, keep a false case to stay false - switch (*lptr | _LOWERCASE_Mask) + switch (scheme.Length) { - case _HTTP_Mask0: - if (nChars == 4) - { - syntax = UriParser.HttpUri; - return true; - } - if (nChars == 5 && ((*(char*)(lptr + 1)) | 0x20) == _HTTPS_Mask1) - { - syntax = UriParser.HttpsUri; - return true; - } - break; - case _WSS_Mask: - if (nChars == 3) - { - syntax = UriParser.WssUri; - return true; - } - break; - case _FILE_Mask0: - if (nChars == 4) - { - syntax = UriParser.FileUri; - return true; - } - break; - case _FTP_Mask: - if (nChars == 3) - { - syntax = UriParser.FtpUri; - return true; - } - break; - - case _NEWS_Mask0: - if (nChars == 4) - { - syntax = UriParser.NewsUri; - return true; - } - break; - - case _NNTP_Mask0: - if (nChars == 4) - { - syntax = UriParser.NntpUri; - return true; - } + case 2: + if (scheme.Equals("ws", StringComparison.OrdinalIgnoreCase)) return UriParser.WsUri; break; - case _UUID_Mask0: - if (nChars == 4) - { - syntax = UriParser.UuidUri; - return true; - } + case 3: + if (scheme.Equals("wss", StringComparison.OrdinalIgnoreCase)) return UriParser.WssUri; + if (scheme.Equals("ftp", StringComparison.OrdinalIgnoreCase)) return UriParser.FtpUri; break; - case _GOPHER_Mask0: - if (nChars == 6 && (*(int*)(lptr + 1) | _INT_LOWERCASE_Mask) == 
_GOPHER_Mask1) - { - syntax = UriParser.GopherUri; - return true; - } - break; - case _MAILTO_Mask0: - if (nChars == 6 && (*(int*)(lptr + 1) | _INT_LOWERCASE_Mask) == _MAILTO_Mask1) - { - syntax = UriParser.MailToUri; - return true; - } + case 4: + if (scheme.Equals("http", StringComparison.OrdinalIgnoreCase)) return UriParser.HttpUri; + if (scheme.Equals("file", StringComparison.OrdinalIgnoreCase)) return UriParser.FileUri; + if (scheme.Equals("uuid", StringComparison.OrdinalIgnoreCase)) return UriParser.UuidUri; + if (scheme.Equals("nntp", StringComparison.OrdinalIgnoreCase)) return UriParser.NntpUri; + if (scheme.Equals("ldap", StringComparison.OrdinalIgnoreCase)) return UriParser.LdapUri; + if (scheme.Equals("news", StringComparison.OrdinalIgnoreCase)) return UriParser.NewsUri; break; - case _TELNET_Mask0: - if (nChars == 6 && (*(int*)(lptr + 1) | _INT_LOWERCASE_Mask) == _TELNET_Mask1) - { - syntax = UriParser.TelnetUri; - return true; - } + case 5: + if (scheme.Equals("https", StringComparison.OrdinalIgnoreCase)) return UriParser.HttpsUri; break; - case _NETXXX_Mask0: - if (nChars == 8 && (*(lptr + 1) | _LOWERCASE_Mask) == _NETPIPE_Mask1) - { - syntax = UriParser.NetPipeUri; - return true; - } - else if (nChars == 7 && (*(lptr + 1) | _LOWERCASE_Mask) == _NETTCP_Mask1) - { - syntax = UriParser.NetTcpUri; - return true; - } + case 6: + if (scheme.Equals("mailto", StringComparison.OrdinalIgnoreCase)) return UriParser.MailToUri; + if (scheme.Equals("gopher", StringComparison.OrdinalIgnoreCase)) return UriParser.GopherUri; + if (scheme.Equals("telnet", StringComparison.OrdinalIgnoreCase)) return UriParser.TelnetUri; break; - case _LDAP_Mask0: - if (nChars == 4) - { - syntax = UriParser.LdapUri; - return true; - } + case 7: + if (scheme.Equals("net.tcp", StringComparison.OrdinalIgnoreCase)) return UriParser.NetTcpUri; break; - default: break; - } - return false; - } - - // - // This will check whether a scheme string follows the rules - // - private static unsafe 
ParsingError CheckSchemeSyntax(ReadOnlySpan<char> span, ref UriParser? syntax) - { - static char ToLowerCaseAscii(char c) => char.IsAsciiLetterUpper(c) ? (char)(c | 0x20) : c; - - if (span.Length == 0) - { - return ParsingError.BadScheme; - } - // The first character must be an alpha. Validate that and store it as lower-case, as - // all of the fast-path checks need that value. - char firstLower = span[0]; - if (char.IsAsciiLetterUpper(firstLower)) - { - firstLower = (char)(firstLower | 0x20); - } - else if (!char.IsAsciiLetterLower(firstLower)) - { - return ParsingError.BadScheme; - } - - // Special-case common and known schemes to avoid allocations and dictionary lookups in these cases. - const int wsMask = 'w' << 8 | 's'; - const int ftpMask = 'f' << 16 | 't' << 8 | 'p'; - const int wssMask = 'w' << 16 | 's' << 8 | 's'; - const int fileMask = 'f' << 24 | 'i' << 16 | 'l' << 8 | 'e'; - const int httpMask = 'h' << 24 | 't' << 16 | 't' << 8 | 'p'; - const int mailMask = 'm' << 24 | 'a' << 16 | 'i' << 8 | 'l'; - switch (span.Length) - { - case 2: - if (wsMask == (firstLower << 8 | ToLowerCaseAscii(span[1]))) - { - syntax = UriParser.WsUri; - return ParsingError.None; - } - break; - case 3: - switch (firstLower << 16 | ToLowerCaseAscii(span[1]) << 8 | ToLowerCaseAscii(span[2])) - { - case ftpMask: - syntax = UriParser.FtpUri; - return ParsingError.None; - case wssMask: - syntax = UriParser.WssUri; - return ParsingError.None; - } - break; - case 4: - switch (firstLower << 24 | ToLowerCaseAscii(span[1]) << 16 | ToLowerCaseAscii(span[2]) << 8 | ToLowerCaseAscii(span[3])) - { - case httpMask: - syntax = UriParser.HttpUri; - return ParsingError.None; - case fileMask: - syntax = UriParser.FileUri; - return ParsingError.None; - } - break; - case 5: - if (httpMask == (firstLower << 24 | ToLowerCaseAscii(span[1]) << 16 | ToLowerCaseAscii(span[2]) << 8 | ToLowerCaseAscii(span[3])) && - ToLowerCaseAscii(span[4]) == 's') - { - syntax = UriParser.HttpsUri; - return ParsingError.None; -
} - break; - case 6: - if (mailMask == (firstLower << 24 | ToLowerCaseAscii(span[1]) << 16 | ToLowerCaseAscii(span[2]) << 8 | ToLowerCaseAscii(span[3])) && - ToLowerCaseAscii(span[4]) == 't' && ToLowerCaseAscii(span[5]) == 'o') - { - syntax = UriParser.MailToUri; - return ParsingError.None; - } + case 8: + if (scheme.Equals("net.pipe", StringComparison.OrdinalIgnoreCase)) return UriParser.NetPipeUri; break; } - // The scheme is not known. Validate all of the characters in the input. - for (int i = 1; i < span.Length; i++) + // scheme = alpha *(alpha | digit | '+' | '-' | '.') + if (scheme.Length == 0 || + !char.IsAsciiLetter(scheme[0]) || + scheme.ContainsAnyExcept(s_schemeChars)) { - char c = span[i]; - if (!char.IsAsciiLetterOrDigit(c) && c != '+' && c != '-' && c != '.') - { - return ParsingError.BadScheme; - } + error = ParsingError.BadScheme; + return null; } - if (span.Length > c_MaxUriSchemeName) + if (scheme.Length > c_MaxUriSchemeName) { - return ParsingError.SchemeLimit; + error = ParsingError.SchemeLimit; + return null; } // Then look up the syntax in a string-based table. 
-#pragma warning disable CS8500 // takes address of managed type - ReadOnlySpan<char> tmpSpan = span; // avoid address exposing the span and impacting the other code in the method that uses it - string str = string.Create(tmpSpan.Length, (IntPtr)(&tmpSpan), (buffer, spanPtr) => - { - int charsWritten = (*(ReadOnlySpan<char>*)spanPtr).ToLowerInvariant(buffer); - Debug.Assert(charsWritten == buffer.Length); - }); -#pragma warning restore CS8500 - syntax = UriParser.FindOrFetchAsUnknownV1Syntax(str); - return ParsingError.None; + return UriParser.FindOrFetchAsUnknownV1Syntax(UriHelper.SpanToLowerInvariantString(scheme)); } // diff --git a/src/libraries/System.Private.Uri/src/System/UriHelper.cs b/src/libraries/System.Private.Uri/src/System/UriHelper.cs index 3fa5b49..e4edf02 100644 --- a/src/libraries/System.Private.Uri/src/System/UriHelper.cs +++ b/src/libraries/System.Private.Uri/src/System/UriHelper.cs @@ -10,6 +10,17 @@ namespace System { internal static class UriHelper { + public static unsafe string SpanToLowerInvariantString(ReadOnlySpan<char> span) + { +#pragma warning disable CS8500 // takes address of managed type + return string.Create(span.Length, (IntPtr)(&span), static (buffer, spanPtr) => + { + int charsWritten = (*(ReadOnlySpan<char>*)spanPtr).ToLowerInvariant(buffer); + Debug.Assert(charsWritten == buffer.Length); + }); +#pragma warning restore CS8500 + } + // http://host/Path/Path/File?Query is the base of // - http://host/Path/Path/File/ ... (those "File" words may be different in semantic but anyway) // - http://host/Path/Path/#Fragment