From: JinWang An Date: Wed, 21 Jun 2023 09:07:33 +0000 (+0900) Subject: [CVE-2022-0391] bpo-43882 - urllib.parse should sanitize urls containing ASCII newlin... X-Git-Tag: submit/tizen_6.0_base/20230622.055407~1 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=refs%2Fchanges%2F74%2F294574%2F1;p=platform%2Fupstream%2Fpython3.git [CVE-2022-0391] bpo-43882 - urllib.parse should sanitize urls containing ASCII newline and tabs. (GH-25923) Co-authored-by: Gregory P. Smith Co-authored-by: Serhiy Storchaka (cherry picked from commit 76cd81d60310d65d01f9d7b48a8985d8ab89c8b4) Co-authored-by: Senthil Kumaran (cherry picked from commit 515a7bc4e13645d0945b46a8e1d9102b918cd407) Co-authored-by: Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com> From f4dac7ec55477a6c5d965e594e74bd6bda786903 Mon Sep 17 00:00:00 2001 From: "Miss Islington (bot)" <31488909+miss-islington@users.noreply.github.com> Date: Thu, 6 May 2021 09:52:36 -0700 Change-Id: I91ca61624a632f61e79348d26d59bfbc6154afe9 Signed-off-by: JinWang An --- diff --git a/Doc/library/urllib.parse.rst b/Doc/library/urllib.parse.rst index eca8777a..d4ecdd07 100644 --- a/Doc/library/urllib.parse.rst +++ b/Doc/library/urllib.parse.rst @@ -293,10 +293,13 @@ or on combining URL components into a URL string. Characters that affect netloc parsing under NFKC normalization will now raise :exc:`ValueError`. - .. versionchanged:: 3.7.17 + .. versionchanged:: 3.7.11 + ASCII newline and tab characters are stripped from the URL. + + ..quf versionchanged:: 3.7.17 Leading WHATWG C0 control and space characters are stripped from the URL. - +.. _WHATWG spec: https://url.spec.whatwg.org/#concept-basic-url-parser .. function:: urlunsplit(parts) @@ -659,6 +662,11 @@ task isn't already covered by the URL parsing functions above. .. seealso:: + `WHATWG`_ - URL Living standard + Working Group for the URL Standard that defines URLs, domains, IP addresses, the + application/x-www-form-urlencoded format, and their API. + + :rfc:`3986` - Uniform Resource Identifiers This is the current standard (STD66). Any changes to urllib.parse module should conform to this. Certain deviations could be observed, which are @@ -682,3 +690,5 @@ task isn't already covered by the URL parsing functions above. :rfc:`1738` - Uniform Resource Locators (URL) This specifies the formal syntax and semantics of absolute URLs. + +.. _WHATWG: https://url.spec.whatwg.org/ diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py index d2fc8357..b8c775f1 100644 --- a/Lib/test/test_urlparse.py +++ b/Lib/test/test_urlparse.py @@ -623,6 +623,55 @@ class UrlParseTestCase(unittest.TestCase): p = urllib.parse.urlsplit(url) with self.assertRaisesRegex(ValueError, "out of range"): p.port + + def test_urlsplit_remove_unsafe_bytes(self): + # Remove ASCII tabs and newlines from input, for http common case scenario. + url = "h\nttp://www.python\n.org\t/java\nscript:\talert('msg\r\n')/?query\n=\tsomething#frag\nment" + p = urllib.parse.urlsplit(url) + self.assertEqual(p.scheme, "http") + self.assertEqual(p.netloc, "www.python.org") + self.assertEqual(p.path, "/javascript:alert('msg')/") + self.assertEqual(p.query, "query=something") + self.assertEqual(p.fragment, "fragment") + self.assertEqual(p.username, None) + self.assertEqual(p.password, None) + self.assertEqual(p.hostname, "www.python.org") + self.assertEqual(p.port, None) + self.assertEqual(p.geturl(), "http://www.python.org/javascript:alert('msg')/?query=something#fragment") + + # Remove ASCII tabs and newlines from input as bytes, for http common case scenario. + url = b"h\nttp://www.python\n.org\t/java\nscript:\talert('msg\r\n')/?query\n=\tsomething#frag\nment" + p = urllib.parse.urlsplit(url) + self.assertEqual(p.scheme, b"http") + self.assertEqual(p.netloc, b"www.python.org") + self.assertEqual(p.path, b"/javascript:alert('msg')/") + self.assertEqual(p.query, b"query=something") + self.assertEqual(p.fragment, b"fragment") + self.assertEqual(p.username, None) + self.assertEqual(p.password, None) + self.assertEqual(p.hostname, b"www.python.org") + self.assertEqual(p.port, None) + self.assertEqual(p.geturl(), b"http://www.python.org/javascript:alert('msg')/?query=something#fragment") + + # any scheme + url = "x-new-scheme\t://www.python\n.org\t/java\nscript:\talert('msg\r\n')/?query\n=\tsomething#frag\nment" + p = urllib.parse.urlsplit(url) + self.assertEqual(p.geturl(), "x-new-scheme://www.python.org/javascript:alert('msg')/?query=something#fragment") + + # Remove ASCII tabs and newlines from input as bytes, any scheme. + url = b"x-new-scheme\t://www.python\n.org\t/java\nscript:\talert('msg\r\n')/?query\n=\tsomething#frag\nment" + p = urllib.parse.urlsplit(url) + self.assertEqual(p.geturl(), b"x-new-scheme://www.python.org/javascript:alert('msg')/?query=something#fragment") + + # Unsafe bytes is not returned from urlparse cache. + # scheme is stored after parsing, sending an scheme with unsafe bytes *will not* return an unsafe scheme + url = "https://www.python\n.org\t/java\nscript:\talert('msg\r\n')/?query\n=\tsomething#frag\nment" + scheme = "htt\nps" + for _ in range(2): + p = urllib.parse.urlsplit(url, scheme=scheme) + self.assertEqual(p.scheme, "https") + self.assertEqual(p.geturl(), "https://www.python.org/javascript:alert('msg')/?query=something#fragment") + def test_attributes_bad_port(self): """Check handling of invalid ports.""" diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index c958313f..46aac6ad 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -80,6 +80,9 @@ scheme_chars = ('abcdefghijklmnopqrstuvwxyz' '0123456789' '+-.') +# Unsafe bytes to be removed per WHATWG spec +_UNSAFE_URL_BYTES_TO_REMOVE = ['\t', '\r', '\n'] + # Leading and trailing C0 control and space to be stripped per WHATWG spec. # == "".join([chr(i) for i in range(0, 0x20 + 1)]) _WHATWG_C0_CONTROL_OR_SPACE = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f ' @@ -415,6 +418,11 @@ def _checknetloc(netloc): raise ValueError("netloc '" + netloc2 + "' contains invalid " + "characters under NFKC normalization") +def _remove_unsafe_bytes_from_url(url): + for b in _UNSAFE_URL_BYTES_TO_REMOVE: + url = url.replace(b, "") + return url + def urlsplit(url, scheme='', allow_fragments=True): """Parse a URL into 5 components: :///?# @@ -426,6 +434,8 @@ def urlsplit(url, scheme='', allow_fragments=True): # (https://url.spec.whatwg.org/#concept-basic-url-parser would strip both) url = url.lstrip(_WHATWG_C0_CONTROL_OR_SPACE) scheme = scheme.strip(_WHATWG_C0_CONTROL_OR_SPACE) + url = _remove_unsafe_bytes_from_url(url) + scheme = _remove_unsafe_bytes_from_url(scheme) allow_fragments = bool(allow_fragments) key = url, scheme, allow_fragments, type(url), type(scheme) cached = _parse_cache.get(key, None)