From a0694f937b2d30a9c92ddd2c7d6c4d2053b4d385 Mon Sep 17 00:00:00 2001
From: Dan Winship
Date: Sun, 14 Feb 2010 18:47:18 -0500
Subject: [PATCH] [SoupURI] tolerate bad %-encoding and other common sorts of URI lossage

https://bugzilla.gnome.org/show_bug.cgi?id=590524
---
Review notes (this region is not part of the commit message):

 * soup_uri_new_with_base(): the fragment length is now
   "end - (hash + 1)". The previous "end - hash + 1" was two larger
   than the distance from the start of the fragment to the trimmed
   end of the URI, so the copy could take in characters (such as
   trailing whitespace) that the cleanup step had just excluded.
 * soup_uri_new_with_base(): tab/CR/LF removal now uses memmove().
   strcpy() with overlapping source and destination is undefined.
 * Both fixes replace one added line with one added line, so the hunk
   sizes and the diffstat below are unchanged. The "index" blob ids
   still describe the pre-review version of the change.
 * Line breaks and indentation were reconstructed from a
   whitespace-collapsed copy, assuming tab indentation. Check the
   context lines against the tree before applying (or apply with
   "git apply --ignore-whitespace").

 libsoup/soup-uri.c  | 101 ++++++++++++++++++++++++++--------------------------
 tests/uri-parsing.c |  21 +++++++----
 2 files changed, 66 insertions(+), 56 deletions(-)

diff --git a/libsoup/soup-uri.c b/libsoup/soup-uri.c
index 77312b7..b1e5e59 100644
--- a/libsoup/soup-uri.c
+++ b/libsoup/soup-uri.c
@@ -92,7 +92,7 @@
  **/
 
 static void append_uri_encoded (GString *str, const char *in, const char *extra_enc_chars);
-static char *uri_decoded_copy (const char *str, int length);
+static char *uri_decoded_copy (const char *str, int length, gboolean fixup);
 static char *uri_normalized_copy (const char *str, int length, const char *unescape_extra, gboolean fixup);
 
 gpointer _SOUP_URI_SCHEME_HTTP, _SOUP_URI_SCHEME_HTTPS;
@@ -146,24 +146,38 @@ soup_uri_new_with_base (SoupURI *base, const char *uri_string)
 	const char *end, *hash, *colon, *at, *path, *question;
 	const char *p, *hostend;
 	gboolean remove_dot_segments = TRUE;
+	int len;
 
-	uri = g_slice_new0 (SoupURI);
-
-	/* See RFC 3986 for details. IF YOU CHANGE ANYTHING IN THIS
-	 * FUNCTION, RUN tests/uri-parsing AFTERWARDS.
+	/* First some cleanup steps (which are supposed to all be no-ops,
+	 * but...). Skip initial whitespace, strip out internal tabs and
+	 * line breaks, and ignore trailing whitespace.
 	 */
+	while (g_ascii_isspace (*uri_string))
+		uri_string++;
+
+	len = strcspn (uri_string, "\t\n\r");
+	if (uri_string[len]) {
+		char *clean = g_strdup (uri_string), *bad;
+
+		while ((bad = strpbrk (clean, "\t\n\r")))
+			memmove (bad, bad + 1, strlen (bad)); /* incl. NUL */
+		uri = soup_uri_new_with_base (base, clean);
+		g_free (clean);
+		return uri;
+	}
+	end = uri_string + len;
+	while (end > uri_string && g_ascii_isspace (end[-1]))
+		end--;
+
+	uri = g_slice_new0 (SoupURI);
 
 	/* Find fragment. */
-	end = hash = strchr (uri_string, '#');
-	if (hash && hash[1]) {
-		uri->fragment = uri_normalized_copy (hash + 1, strlen (hash + 1),
-						     NULL, FALSE);
-		if (!uri->fragment) {
-			soup_uri_free (uri);
-			return NULL;
-		}
-	} else
-		end = uri_string + strlen (uri_string);
+	hash = strchr (uri_string, '#');
+	if (hash) {
+		uri->fragment = uri_normalized_copy (hash + 1, end - (hash + 1),
+						     NULL, TRUE);
+		end = hash;
+	}
 
 	/* Find scheme: initial [a-z+.-]* substring until ":" */
 	p = uri_string;
@@ -173,14 +187,10 @@ soup_uri_new_with_base (SoupURI *base, const char *uri_string)
 
 	if (p > uri_string && *p == ':') {
 		uri->scheme = soup_uri_get_scheme (uri_string, p - uri_string);
-		if (!uri->scheme) {
-			soup_uri_free (uri);
-			return NULL;
-		}
 		uri_string = p + 1;
 	}
 
-	if (!*uri_string && !base)
+	if (uri_string == end && !base && !uri->fragment)
 		return uri;
 
 	/* Check for authority */
@@ -193,22 +203,16 @@ soup_uri_new_with_base (SoupURI *base, const char *uri_string)
 		colon = strchr (uri_string, ':');
 		if (colon && colon < at) {
 			uri->password = uri_decoded_copy (colon + 1,
-							  at - colon - 1);
-			if (!uri->password) {
-				soup_uri_free (uri);
-				return NULL;
-			}
+							  at - colon - 1,
+							  TRUE);
 		} else {
 			uri->password = NULL;
 			colon = at;
 		}
 
 		uri->user = uri_decoded_copy (uri_string,
-					      colon - uri_string);
-		if (!uri->user) {
-			soup_uri_free (uri);
-			return NULL;
-		}
+					      colon - uri_string,
+					      TRUE);
 		uri_string = at + 1;
 	} else
 		uri->user = uri->password = NULL;
@@ -230,11 +234,8 @@ soup_uri_new_with_base (SoupURI *base, const char *uri_string)
 			hostend = colon ? colon : path;
 		}
 
-		uri->host = uri_decoded_copy (uri_string, hostend - uri_string);
-		if (!uri->host) {
-			soup_uri_free (uri);
-			return NULL;
-		}
+		uri->host = uri_decoded_copy (uri_string, hostend - uri_string,
+					      TRUE);
 
 		if (colon && colon != path - 1) {
 			char *portend;
@@ -254,23 +255,15 @@ soup_uri_new_with_base (SoupURI *base, const char *uri_string)
 		uri->query = uri_normalized_copy (question + 1,
 						  end - (question + 1),
 						  NULL, TRUE);
-		if (!uri->query) {
-			soup_uri_free (uri);
-			return NULL;
-		}
 		end = question;
 	}
 
 	if (end != uri_string) {
 		uri->path = uri_normalized_copy (uri_string, end - uri_string,
 						 NULL, TRUE);
-		if (!uri->path) {
-			soup_uri_free (uri);
-			return NULL;
-		}
 	}
 
-	/* Apply base URI. Again, this is spelled out in RFC 3986. */
+	/* Apply base URI. This is spelled out in RFC 3986. */
 	if (base && !uri->scheme && uri->host)
 		uri->scheme = base->scheme;
 	else if (base && !uri->scheme) {
@@ -626,7 +619,7 @@ soup_uri_encode (const char *part, const char *escape_extra)
 #define HEXCHAR(s) ((XDIGIT (s[1]) << 4) + XDIGIT (s[2]))
 
 static char *
-uri_decoded_copy (const char *part, int length)
+uri_decoded_copy (const char *part, int length, gboolean fixup)
 {
 	unsigned char *s, *d;
 	char *decoded = g_strndup (part, length);
@@ -636,8 +629,12 @@ uri_decoded_copy (const char *part, int length)
 		if (*s == '%') {
 			if (!g_ascii_isxdigit (s[1]) ||
 			    !g_ascii_isxdigit (s[2])) {
-				g_free (decoded);
-				return NULL;
+				if (!fixup) {
+					g_free (decoded);
+					return NULL;
+				}
+				*d++ = *s;
+				continue;
 			}
 			*d++ = HEXCHAR (s);
 			s += 2;
@@ -660,7 +657,7 @@ uri_decoded_copy (const char *part, int length)
 char *
 soup_uri_decode (const char *part)
 {
-	return uri_decoded_copy (part, strlen (part));
+	return uri_decoded_copy (part, strlen (part), FALSE);
 }
 
 static char *
@@ -676,8 +673,12 @@ uri_normalized_copy (const char *part, int length,
 		if (*s == '%') {
 			if (!g_ascii_isxdigit (s[1]) ||
 			    !g_ascii_isxdigit (s[2])) {
-				g_free (normalized);
-				return NULL;
+				if (!fixup) {
+					g_free (normalized);
+					return NULL;
+				}
+				*d++ = *s;
+				continue;
 			}
 
 			c = HEXCHAR (s);
diff --git a/tests/uri-parsing.c b/tests/uri-parsing.c
index c2e4b58..49a92a6 100644
--- a/tests/uri-parsing.c
+++ b/tests/uri-parsing.c
@@ -37,11 +37,6 @@ static struct {
 	  "http://delims/%3C%3E%23%25%22" },
 	{ "http://unwise-chars/%7B%7D%7C%5C%5E%5B%5D%60",
 	  "http://unwise-chars/%7B%7D%7C%5C%5E%5B%5D%60" },
-	{ "http://host/path%", NULL },
-	{ "http://host/path%%", NULL },
-	{ "http://host/path%%%", NULL },
-	{ "http://host/path%/x/", NULL },
-	{ "http://host/path%0x/", NULL },
 
 	/* From RFC 2732 */
 	{ "http://[FEDC:BA98:7654:3210:FEDC:BA98:7654:3210]:80/index.html",
@@ -62,10 +57,24 @@ static struct {
 	/* Try to recover certain kinds of invalid URIs */
 	{ "http://host/path with spaces",
 	  "http://host/path%20with%20spaces" },
+	{ " http://host/path", "http://host/path" },
+	{ "http://host/path ", "http://host/path" },
+	{ "http://host/pa\nth", "http://host/path" },
+	{ "http:\r\n//host/path", "http://host/path" },
+	{ "http://\thost/path", "http://host/path" },
 
 	/* Bug 594405; 0-length is different from not-present */
 	{ "http://host/path?", "http://host/path?" },
-	{ "http://host/path#", "http://host/path#" }
+	{ "http://host/path#", "http://host/path#" },
+
+	/* Bug 590524; tolerate bad %-encoding */
+	{ "http://host/path%", "http://host/path%" },
+	{ "http://h%ost/path", "http://h%25ost/path" },
+	{ "http://host/path%%", "http://host/path%%" },
+	{ "http://host/path%%%", "http://host/path%%%" },
+	{ "http://host/path%/x/", "http://host/path%/x/" },
+	{ "http://host/path%0x/", "http://host/path%0x/" },
+	{ "http://host/path%ax", "http://host/path%ax" }
 };
 static int num_abs_tests = G_N_ELEMENTS(abs_tests);
 
-- 
2.7.4