From a0694f937b2d30a9c92ddd2c7d6c4d2053b4d385 Mon Sep 17 00:00:00 2001
From: Dan Winship <danw@gnome.org>
Date: Sun, 14 Feb 2010 18:47:18 -0500
Subject: [PATCH] [SoupURI] tolerate bad %-encoding and other common sorts of
 URI lossage

https://bugzilla.gnome.org/show_bug.cgi?id=590524
---
 libsoup/soup-uri.c  | 101 ++++++++++++++++++++++++++--------------------------
 tests/uri-parsing.c |  21 +++++++----
 2 files changed, 66 insertions(+), 56 deletions(-)

diff --git a/libsoup/soup-uri.c b/libsoup/soup-uri.c
index 77312b7..b1e5e59 100644
--- a/libsoup/soup-uri.c
+++ b/libsoup/soup-uri.c
@@ -92,7 +92,7 @@
  **/
 
 static void append_uri_encoded (GString *str, const char *in, const char *extra_enc_chars);
-static char *uri_decoded_copy (const char *str, int length);
+static char *uri_decoded_copy (const char *str, int length, gboolean fixup);
 static char *uri_normalized_copy (const char *str, int length, const char *unescape_extra, gboolean fixup);
 
 gpointer _SOUP_URI_SCHEME_HTTP, _SOUP_URI_SCHEME_HTTPS;
@@ -146,24 +146,38 @@ soup_uri_new_with_base (SoupURI *base, const char *uri_string)
 	const char *end, *hash, *colon, *at, *path, *question;
 	const char *p, *hostend;
 	gboolean remove_dot_segments = TRUE;
+	int len;
 
-	uri = g_slice_new0 (SoupURI);
-
-	/* See RFC 3986 for details. IF YOU CHANGE ANYTHING IN THIS
-	 * FUNCTION, RUN tests/uri-parsing AFTERWARDS.
+	/* Cleanup (normally a no-op): skip leading whitespace, re-parse a
+	 * copy with tabs/CR/LF removed (memmove, since the ranges overlap),
+	 * and make "end" ignore trailing whitespace.
 	 */
+	while (g_ascii_isspace (*uri_string))
+		uri_string++;
+
+	len = strcspn (uri_string, "\t\n\r");
+	if (uri_string[len]) {
+		char *clean = g_strdup (uri_string), *bad;
+
+		while ((bad = strpbrk (clean, "\t\n\r")))
+			memmove (bad, bad + 1, strlen (bad + 1) + 1);
+		uri = soup_uri_new_with_base (base, clean);
+		g_free (clean);
+		return uri;
+	}
+	end = uri_string + len;
+	while (end > uri_string && g_ascii_isspace (end[-1]))
+		end--;
+
+	uri = g_slice_new0 (SoupURI);
 
 	/* Find fragment. */
-	end = hash = strchr (uri_string, '#');
-	if (hash && hash[1]) {
-		uri->fragment = uri_normalized_copy (hash + 1, strlen (hash + 1),
-						     NULL, FALSE);
-		if (!uri->fragment) {
-			soup_uri_free (uri);
-			return NULL;
-		}
-	} else
-		end = uri_string + strlen (uri_string);
+	hash = strchr (uri_string, '#');
+	if (hash) {	/* fragment text is hash + 1 up to (not including) end */
+		uri->fragment = uri_normalized_copy (hash + 1, end - (hash + 1),
+						     NULL, TRUE);
+		end = hash;	/* everything parsed below stops before '#' */
+	}
 
 	/* Find scheme: initial [a-z+.-]* substring until ":" */
 	p = uri_string;
@@ -173,14 +187,10 @@ soup_uri_new_with_base (SoupURI *base, const char *uri_string)
 
 	if (p > uri_string && *p == ':') {
 		uri->scheme = soup_uri_get_scheme (uri_string, p - uri_string);
-		if (!uri->scheme) {
-			soup_uri_free (uri);
-			return NULL;
-		}
 		uri_string = p + 1;
 	}
 
-	if (!*uri_string && !base)
+	if (uri_string == end && !base && !uri->fragment)
 		return uri;
 
 	/* Check for authority */
@@ -193,22 +203,16 @@ soup_uri_new_with_base (SoupURI *base, const char *uri_string)
 			colon = strchr (uri_string, ':');
 			if (colon && colon < at) {
 				uri->password = uri_decoded_copy (colon + 1,
-								  at - colon - 1);
-				if (!uri->password) {
-					soup_uri_free (uri);
-					return NULL;
-				}
+								  at - colon - 1,
+								  TRUE);
 			} else {
 				uri->password = NULL;
 				colon = at;
 			}
 
 			uri->user = uri_decoded_copy (uri_string,
-						      colon - uri_string);
-			if (!uri->user) {
-				soup_uri_free (uri);
-				return NULL;
-			}
+						      colon - uri_string,
+						      TRUE);
 			uri_string = at + 1;
 		} else
 			uri->user = uri->password = NULL;
@@ -230,11 +234,8 @@ soup_uri_new_with_base (SoupURI *base, const char *uri_string)
 			hostend = colon ? colon : path;
 		}
 
-		uri->host = uri_decoded_copy (uri_string, hostend - uri_string);
-		if (!uri->host) {
-			soup_uri_free (uri);
-			return NULL;
-		}
+		uri->host = uri_decoded_copy (uri_string, hostend - uri_string,
+					      TRUE);
 
 		if (colon && colon != path - 1) {
 			char *portend;
@@ -254,23 +255,15 @@ soup_uri_new_with_base (SoupURI *base, const char *uri_string)
 		uri->query = uri_normalized_copy (question + 1,
 						  end - (question + 1),
 						  NULL, TRUE);
-		if (!uri->query) {
-			soup_uri_free (uri);
-			return NULL;
-		}
 		end = question;
 	}
 
 	if (end != uri_string) {
 		uri->path = uri_normalized_copy (uri_string, end - uri_string,
 						 NULL, TRUE);
-		if (!uri->path) {
-			soup_uri_free (uri);
-			return NULL;
-		}
 	}
 
-	/* Apply base URI. Again, this is spelled out in RFC 3986. */
+	/* Apply base URI. This is spelled out in RFC 3986. */
 	if (base && !uri->scheme && uri->host)
 		uri->scheme = base->scheme;
 	else if (base && !uri->scheme) {
@@ -626,7 +619,7 @@ soup_uri_encode (const char *part, const char *escape_extra)
 #define HEXCHAR(s) ((XDIGIT (s[1]) << 4) + XDIGIT (s[2]))
 
 static char *
-uri_decoded_copy (const char *part, int length)
+uri_decoded_copy (const char *part, int length, gboolean fixup)
 {
 	unsigned char *s, *d;
 	char *decoded = g_strndup (part, length);
@@ -636,8 +629,12 @@ uri_decoded_copy (const char *part, int length)
 		if (*s == '%') {
 			if (!g_ascii_isxdigit (s[1]) ||
 			    !g_ascii_isxdigit (s[2])) {
-				g_free (decoded);
-				return NULL;
+				if (!fixup) {	/* strict: reject the whole string */
+					g_free (decoded);
+					return NULL;
+				}
+				*d++ = *s;	/* lenient: keep the stray '%' as-is */
+				continue;
 			}
 			*d++ = HEXCHAR (s);
 			s += 2;
@@ -660,7 +657,7 @@ uri_decoded_copy (const char *part, int length)
 char *
 soup_uri_decode (const char *part)
 {
-	return uri_decoded_copy (part, strlen (part));
+	return uri_decoded_copy (part, strlen (part), FALSE);
 }
 
 static char *
@@ -676,8 +673,12 @@ uri_normalized_copy (const char *part, int length,
 		if (*s == '%') {
 			if (!g_ascii_isxdigit (s[1]) ||
 			    !g_ascii_isxdigit (s[2])) {
-				g_free (normalized);
-				return NULL;
+				if (!fixup) {	/* strict: reject the whole string */
+					g_free (normalized);
+					return NULL;
+				}
+				*d++ = *s;	/* lenient: keep the stray '%' as-is */
+				continue;
 			}
 
 			c = HEXCHAR (s);
diff --git a/tests/uri-parsing.c b/tests/uri-parsing.c
index c2e4b58..49a92a6 100644
--- a/tests/uri-parsing.c
+++ b/tests/uri-parsing.c
@@ -37,11 +37,6 @@ static struct {
 	  "http://delims/%3C%3E%23%25%22" },
 	{ "http://unwise-chars/%7B%7D%7C%5C%5E%5B%5D%60",
 	  "http://unwise-chars/%7B%7D%7C%5C%5E%5B%5D%60" },
-	{ "http://host/path%", NULL },
-	{ "http://host/path%%", NULL },
-	{ "http://host/path%%%", NULL },
-	{ "http://host/path%/x/", NULL },
-	{ "http://host/path%0x/", NULL },
 
 	/* From RFC 2732 */
 	{ "http://[FEDC:BA98:7654:3210:FEDC:BA98:7654:3210]:80/index.html",
@@ -62,10 +57,24 @@ static struct {
 	/* Try to recover certain kinds of invalid URIs */
 	{ "http://host/path with spaces",
 	  "http://host/path%20with%20spaces" },
+	{ "  http://host/path", "http://host/path" },
+	{ "http://host/path  ", "http://host/path" },
+	{ "http://host/pa\nth", "http://host/path" },
+	{ "http:\r\n//host/path", "http://host/path" },
+	{ "http://\thost/path", "http://host/path" },
 
 	/* Bug 594405; 0-length is different from not-present */
 	{ "http://host/path?", "http://host/path?" },
-	{ "http://host/path#", "http://host/path#" }
+	{ "http://host/path#", "http://host/path#" },
+
+	/* Bug 590524; tolerate bad %-encoding */
+	{ "http://host/path%", "http://host/path%" },
+	{ "http://h%ost/path", "http://h%25ost/path" },
+	{ "http://host/path%%", "http://host/path%%" },
+	{ "http://host/path%%%", "http://host/path%%%" },
+	{ "http://host/path%/x/", "http://host/path%/x/" },
+	{ "http://host/path%0x/", "http://host/path%0x/" },
+	{ "http://host/path%ax", "http://host/path%ax" }
 };
 static int num_abs_tests = G_N_ELEMENTS(abs_tests);
 
-- 
2.7.4