[SoupURI] tolerate bad %-encoding and other common sorts of URI lossage

author Dan Winship <danw@gnome.org>

Sun, 14 Feb 2010 23:47:18 +0000 (18:47 -0500)

committer Dan Winship <danw@gnome.org>

Sun, 14 Feb 2010 23:47:18 +0000 (18:47 -0500)
author Dan Winship <danw@gnome.org>
Sun, 14 Feb 2010 23:47:18 +0000 (18:47 -0500)
committer Dan Winship <danw@gnome.org>
Sun, 14 Feb 2010 23:47:18 +0000 (18:47 -0500)
diff --git a/libsoup/soup-uri.c b/libsoup/soup-uri.c

index 77312b7..b1e5e59 100644 (file)
--- a/libsoup/soup-uri.c
+++ b/libsoup/soup-uri.c
@@ -92,7 +92,7 @@
   **/
  
  static void append_uri_encoded (GString *str, const char *in, const char *extra_enc_chars);
-static char *uri_decoded_copy (const char *str, int length);
+static char *uri_decoded_copy (const char *str, int length, gboolean fixup);
  static char *uri_normalized_copy (const char *str, int length, const char *unescape_extra, gboolean fixup);
  
  gpointer _SOUP_URI_SCHEME_HTTP, _SOUP_URI_SCHEME_HTTPS;
@@ -146,24 +146,38 @@ soup_uri_new_with_base (SoupURI *base, const char *uri_string)
         const char *end, *hash, *colon, *at, *path, *question;
         const char *p, *hostend;
         gboolean remove_dot_segments = TRUE;
+       int len;
  
-       uri = g_slice_new0 (SoupURI);
-
-       /* See RFC 3986 for details. IF YOU CHANGE ANYTHING IN THIS
-        * FUNCTION, RUN tests/uri-parsing AFTERWARDS.
+       /* First some cleanup steps (which are supposed to all be no-ops,
+        * but...). Skip initial whitespace, strip out internal tabs and
+        * line breaks, and ignore trailing whitespace.
          */
+       while (g_ascii_isspace (*uri_string))
+               uri_string++;
+
+       len = strcspn (uri_string, "\t\n\r");
+       if (uri_string[len]) {
+               char *clean = g_strdup (uri_string), *bad;
+
+               while ((bad = strpbrk (clean, "\t\n\r")))
+                       memmove (bad, bad + 1, strlen (bad)); /* overlap-safe; moves the NUL too */
+               uri = soup_uri_new_with_base (base, clean);
+               g_free (clean);
+               return uri;
+       }
+       end = uri_string + len;
+       while (end > uri_string && g_ascii_isspace (end[-1]))
+               end--;
+
+       uri = g_slice_new0 (SoupURI);
  
         /* Find fragment. */
-       end = hash = strchr (uri_string, '#');
-       if (hash && hash[1]) {
-               uri->fragment = uri_normalized_copy (hash + 1, strlen (hash + 1),
-                                                    NULL, FALSE);
-               if (!uri->fragment) {
-                       soup_uri_free (uri);
-                       return NULL;
-               }
-       } else
-               end = uri_string + strlen (uri_string);
+       hash = strchr (uri_string, '#');
+       if (hash) { /* fragment is the text strictly between '#' and end */
+               uri->fragment = uri_normalized_copy (hash + 1, end - (hash + 1),
+                                                    NULL, TRUE);
+               end = hash;
+       }
  
         /* Find scheme: initial [a-z+.-]* substring until ":" */
         p = uri_string;
@@ -173,14 +187,10 @@ soup_uri_new_with_base (SoupURI *base, const char *uri_string)
  
         if (p > uri_string && *p == ':') {
                 uri->scheme = soup_uri_get_scheme (uri_string, p - uri_string);
-               if (!uri->scheme) {
-                       soup_uri_free (uri);
-                       return NULL;
-               }
                 uri_string = p + 1;
         }
  
-       if (!*uri_string && !base)
+       if (uri_string == end && !base && !uri->fragment)
                 return uri;
  
         /* Check for authority */
@@ -193,22 +203,16 @@ soup_uri_new_with_base (SoupURI *base, const char *uri_string)
                         colon = strchr (uri_string, ':');
                         if (colon && colon < at) {
                                 uri->password = uri_decoded_copy (colon + 1,
-                                                                 at - colon - 1);
-                               if (!uri->password) {
-                                       soup_uri_free (uri);
-                                       return NULL;
-                               }
+                                                                 at - colon - 1,
+                                                                 TRUE);
                         } else {
                                 uri->password = NULL;
                                 colon = at;
                         }
  
                         uri->user = uri_decoded_copy (uri_string,
-                                                     colon - uri_string);
-                       if (!uri->user) {
-                               soup_uri_free (uri);
-                               return NULL;
-                       }
+                                                     colon - uri_string,
+                                                     TRUE);
                         uri_string = at + 1;
                 } else
                         uri->user = uri->password = NULL;
@@ -230,11 +234,8 @@ soup_uri_new_with_base (SoupURI *base, const char *uri_string)
                         hostend = colon ? colon : path;
                 }
  
-               uri->host = uri_decoded_copy (uri_string, hostend - uri_string);
-               if (!uri->host) {
-                       soup_uri_free (uri);
-                       return NULL;
-               }
+               uri->host = uri_decoded_copy (uri_string, hostend - uri_string,
+                                             TRUE);
  
                 if (colon && colon != path - 1) {
                         char *portend;
@@ -254,23 +255,15 @@ soup_uri_new_with_base (SoupURI *base, const char *uri_string)
                 uri->query = uri_normalized_copy (question + 1,
                                                   end - (question + 1),
                                                   NULL, TRUE);
-               if (!uri->query) {
-                       soup_uri_free (uri);
-                       return NULL;
-               }
                 end = question;
         }
  
         if (end != uri_string) {
                 uri->path = uri_normalized_copy (uri_string, end - uri_string,
                                                  NULL, TRUE);
-               if (!uri->path) {
-                       soup_uri_free (uri);
-                       return NULL;
-               }
         }
  
-       /* Apply base URI. Again, this is spelled out in RFC 3986. */
+       /* Apply base URI. This is spelled out in RFC 3986. */
         if (base && !uri->scheme && uri->host)
                 uri->scheme = base->scheme;
         else if (base && !uri->scheme) {
@@ -626,7 +619,7 @@ soup_uri_encode (const char *part, const char *escape_extra)
  #define HEXCHAR(s) ((XDIGIT (s[1]) << 4) + XDIGIT (s[2]))
  
  static char *
-uri_decoded_copy (const char *part, int length)
+uri_decoded_copy (const char *part, int length, gboolean fixup)
  {
         unsigned char *s, *d;
         char *decoded = g_strndup (part, length);
@@ -636,8 +629,12 @@ uri_decoded_copy (const char *part, int length)
                 if (*s == '%') {
                         if (!g_ascii_isxdigit (s[1]) ||
                             !g_ascii_isxdigit (s[2])) {
-                               g_free (decoded);
-                               return NULL;
+                               if (!fixup) {
+                                       g_free (decoded);
+                                       return NULL;
+                               }
+                               *d++ = *s;
+                               continue;
                         }
                         *d++ = HEXCHAR (s);
                         s += 2;
@@ -660,7 +657,7 @@ uri_decoded_copy (const char *part, int length)
  char *
  soup_uri_decode (const char *part)
  {
-       return uri_decoded_copy (part, strlen (part));
+       return uri_decoded_copy (part, strlen (part), FALSE);
  }
  
  static char *
@@ -676,8 +673,12 @@ uri_normalized_copy (const char *part, int length,
                 if (*s == '%') {
                         if (!g_ascii_isxdigit (s[1]) ||
                             !g_ascii_isxdigit (s[2])) {
-                               g_free (normalized);
-                               return NULL;
+                               if (!fixup) {
+                                       g_free (normalized);
+                                       return NULL;
+                               }
+                               *d++ = *s;
+                               continue;
                         }
  
                         c = HEXCHAR (s);
diff --git a/tests/uri-parsing.c b/tests/uri-parsing.c

index c2e4b58..49a92a6 100644 (file)
--- a/tests/uri-parsing.c
+++ b/tests/uri-parsing.c
@@ -37,11 +37,6 @@ static struct {
           "http://delims/%3C%3E%23%25%22" },
         { "http://unwise-chars/%7B%7D%7C%5C%5E%5B%5D%60",
           "http://unwise-chars/%7B%7D%7C%5C%5E%5B%5D%60" },
-       { "http://host/path%", NULL },
-       { "http://host/path%%", NULL },
-       { "http://host/path%%%", NULL },
-       { "http://host/path%/x/", NULL },
-       { "http://host/path%0x/", NULL },
  
         /* From RFC 2732 */
         { "http://[FEDC:BA98:7654:3210:FEDC:BA98:7654:3210]:80/index.html",
@@ -62,10 +57,24 @@ static struct {
         /* Try to recover certain kinds of invalid URIs */
         { "http://host/path with spaces",
           "http://host/path%20with%20spaces" },
+       { "  http://host/path", "http://host/path" },
+       { "http://host/path  ", "http://host/path" },
+       { "http://host/pa\nth", "http://host/path" },
+       { "http:\r\n//host/path", "http://host/path" },
+       { "http://\thost/path", "http://host/path" },
  
         /* Bug 594405; 0-length is different from not-present */
         { "http://host/path?", "http://host/path?" },
-       { "http://host/path#", "http://host/path#" }
+       { "http://host/path#", "http://host/path#" },
+
+       /* Bug 590524; tolerate badly-%-encoded URIs */
+       { "http://host/path%", "http://host/path%" },
+       { "http://h%ost/path", "http://h%25ost/path" },
+       { "http://host/path%%", "http://host/path%%" },
+       { "http://host/path%%%", "http://host/path%%%" },
+       { "http://host/path%/x/", "http://host/path%/x/" },
+       { "http://host/path%0x/", "http://host/path%0x/" },
+       { "http://host/path%ax", "http://host/path%ax" }
  };
  static int num_abs_tests = G_N_ELEMENTS(abs_tests);
author	Dan Winship <danw@gnome.org>
	Sun, 14 Feb 2010 23:47:18 +0000 (18:47 -0500)
committer	Dan Winship <danw@gnome.org>
	Sun, 14 Feb 2010 23:47:18 +0000 (18:47 -0500)
libsoup/soup-uri.c		patch \| blob \| history
tests/uri-parsing.c		patch \| blob \| history