Fixed URI encoding of reserved characters (Issue #369).
author Matt Giuca <matt.giuca@gmail.com>
Tue, 14 Feb 2012 01:51:03 +0000 (12:51 +1100)
committer Matt Giuca <matt.giuca@gmail.com>
Tue, 14 Feb 2012 01:51:03 +0000 (12:51 +1100)
Previously, utils.requote_path would unquote and requote all characters,
causing reserved characters to become encoded (changing the semantics of the
URI). Now, it has special code for unquoting just the unreserved characters,
then quotes only illegal characters.
This ensures that illegal characters are fixed, and URIs are normalised, but
reserved characters do not erroneously become quoted.
Test case test_session_with_escaped_url now passes.

requests/utils.py

index a773f10..f4f98c4 100644 (file)
@@ -396,6 +396,28 @@ def stream_decompress(iterator, mode='gzip'):
         if rv:
             yield rv
 
+# The unreserved URI characters (RFC 3986): ASCII letters, digits, "-", ".", "_" and "~".
+UNRESERVED_SET = frozenset(
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
+    + "0123456789-._~")
+
+def unquote_unreserved(uri):
+    """Un-escape the percent-escapes in `uri` that encode unreserved characters
+    (members of UNRESERVED_SET) and return the resulting string.
+    All other escapes (reserved, illegal and non-ASCII bytes) stay encoded.
+    """
+    parts = uri.split('%')  # parts[0] precedes the first '%'; every later part followed a '%'
+    for i in range(1, len(parts)):
+        h = parts[i][0:2]  # candidate two-character hex code right after the '%'
+        if len(h) == 2:
+            c = chr(int(h, 16))  # NOTE: int() raises ValueError when h is not hexadecimal (e.g. 'zz')
+            if c in UNRESERVED_SET:
+                parts[i] = c + parts[i][2:]  # swap the escape for the literal character
+            else:
+                parts[i] = '%' + parts[i]  # not unreserved: put the escape back exactly as written
+        else:
+            parts[i] = '%' + parts[i]  # fewer than two characters follow: keep the bare '%'
+    return ''.join(parts)
 
 def requote_path(path):
     """Re-quote the given URL path component.
@@ -404,5 +426,9 @@ def requote_path(path):
     ensure that it is fully and consistently quoted.
     """
     parts = path.split("/")
-    parts = (quote(unquote(part), safe="") for part in parts)
+    # Unquote only the unreserved characters
+    # Then quote only illegal characters (do not quote reserved, unreserved,
+    # or '%')
+    parts = (quote(unquote_unreserved(part), safe="!#$%&'()*+,/:;=?@[]~")
+             for part in parts)
     return "/".join(parts)