**/
static void append_uri_encoded (GString *str, const char *in, const char *extra_enc_chars);
-static char *uri_decoded_copy (const char *str, int length);
+static char *uri_decoded_copy (const char *str, int length, gboolean fixup);
static char *uri_normalized_copy (const char *str, int length, const char *unescape_extra, gboolean fixup);
gpointer _SOUP_URI_SCHEME_HTTP, _SOUP_URI_SCHEME_HTTPS;
const char *end, *hash, *colon, *at, *path, *question;
const char *p, *hostend;
gboolean remove_dot_segments = TRUE;
+ int len;
- uri = g_slice_new0 (SoupURI);
-
- /* See RFC 3986 for details. IF YOU CHANGE ANYTHING IN THIS
- * FUNCTION, RUN tests/uri-parsing AFTERWARDS.
+ /* First some cleanup steps (which are supposed to all be no-ops,
+ * but...). Skip initial whitespace, strip out internal tabs and
+ * line breaks, and ignore trailing whitespace.
*/
+ while (g_ascii_isspace (*uri_string))
+ uri_string++;
+
+ len = strcspn (uri_string, "\t\n\r");
+ if (uri_string[len]) {
+ char *clean = g_strdup (uri_string), *bad;
+
+ while ((bad = strpbrk (clean, "\t\n\r")))
+ strcpy (bad, bad + 1);
+ uri = soup_uri_new_with_base (base, clean);
+ g_free (clean);
+ return uri;
+ }
+ end = uri_string + len;
+ while (end > uri_string && g_ascii_isspace (end[-1]))
+ end--;
+
+ uri = g_slice_new0 (SoupURI);
/* Find fragment. */
- end = hash = strchr (uri_string, '#');
- if (hash && hash[1]) {
- uri->fragment = uri_normalized_copy (hash + 1, strlen (hash + 1),
- NULL, FALSE);
- if (!uri->fragment) {
- soup_uri_free (uri);
- return NULL;
- }
- } else
- end = uri_string + strlen (uri_string);
+ hash = strchr (uri_string, '#');
+ if (hash) {
+ uri->fragment = uri_normalized_copy (hash + 1, end - hash + 1,
+ NULL, TRUE);
+ end = hash;
+ }
/* Find scheme: initial [a-z+.-]* substring until ":" */
p = uri_string;
if (p > uri_string && *p == ':') {
uri->scheme = soup_uri_get_scheme (uri_string, p - uri_string);
- if (!uri->scheme) {
- soup_uri_free (uri);
- return NULL;
- }
uri_string = p + 1;
}
- if (!*uri_string && !base)
+ if (uri_string == end && !base && !uri->fragment)
return uri;
/* Check for authority */
colon = strchr (uri_string, ':');
if (colon && colon < at) {
uri->password = uri_decoded_copy (colon + 1,
- at - colon - 1);
- if (!uri->password) {
- soup_uri_free (uri);
- return NULL;
- }
+ at - colon - 1,
+ TRUE);
} else {
uri->password = NULL;
colon = at;
}
uri->user = uri_decoded_copy (uri_string,
- colon - uri_string);
- if (!uri->user) {
- soup_uri_free (uri);
- return NULL;
- }
+ colon - uri_string,
+ TRUE);
uri_string = at + 1;
} else
uri->user = uri->password = NULL;
hostend = colon ? colon : path;
}
- uri->host = uri_decoded_copy (uri_string, hostend - uri_string);
- if (!uri->host) {
- soup_uri_free (uri);
- return NULL;
- }
+ uri->host = uri_decoded_copy (uri_string, hostend - uri_string,
+ TRUE);
if (colon && colon != path - 1) {
char *portend;
uri->query = uri_normalized_copy (question + 1,
end - (question + 1),
NULL, TRUE);
- if (!uri->query) {
- soup_uri_free (uri);
- return NULL;
- }
end = question;
}
if (end != uri_string) {
uri->path = uri_normalized_copy (uri_string, end - uri_string,
NULL, TRUE);
- if (!uri->path) {
- soup_uri_free (uri);
- return NULL;
- }
}
- /* Apply base URI. Again, this is spelled out in RFC 3986. */
+ /* Apply base URI. This is spelled out in RFC 3986. */
if (base && !uri->scheme && uri->host)
uri->scheme = base->scheme;
else if (base && !uri->scheme) {
#define HEXCHAR(s) ((XDIGIT (s[1]) << 4) + XDIGIT (s[2]))
static char *
-uri_decoded_copy (const char *part, int length)
+uri_decoded_copy (const char *part, int length, gboolean fixup)
{
unsigned char *s, *d;
char *decoded = g_strndup (part, length);
if (*s == '%') {
if (!g_ascii_isxdigit (s[1]) ||
!g_ascii_isxdigit (s[2])) {
- g_free (decoded);
- return NULL;
+ if (!fixup) {
+ g_free (decoded);
+ return NULL;
+ }
+ *d++ = *s;
+ continue;
}
*d++ = HEXCHAR (s);
s += 2;
char *
soup_uri_decode (const char *part)
{
- return uri_decoded_copy (part, strlen (part));
+ return uri_decoded_copy (part, strlen (part), FALSE);
}
static char *
if (*s == '%') {
if (!g_ascii_isxdigit (s[1]) ||
!g_ascii_isxdigit (s[2])) {
- g_free (normalized);
- return NULL;
+ if (!fixup) {
+ g_free (normalized);
+ return NULL;
+ }
+ *d++ = *s;
+ continue;
}
c = HEXCHAR (s);
"http://delims/%3C%3E%23%25%22" },
{ "http://unwise-chars/%7B%7D%7C%5C%5E%5B%5D%60",
"http://unwise-chars/%7B%7D%7C%5C%5E%5B%5D%60" },
- { "http://host/path%", NULL },
- { "http://host/path%%", NULL },
- { "http://host/path%%%", NULL },
- { "http://host/path%/x/", NULL },
- { "http://host/path%0x/", NULL },
/* From RFC 2732 */
{ "http://[FEDC:BA98:7654:3210:FEDC:BA98:7654:3210]:80/index.html",
/* Try to recover certain kinds of invalid URIs */
{ "http://host/path with spaces",
"http://host/path%20with%20spaces" },
+ { " http://host/path", "http://host/path" },
+ { "http://host/path ", "http://host/path" },
+ { "http://host/pa\nth", "http://host/path" },
+ { "http:\r\n//host/path", "http://host/path" },
+ { "http://\thost/path", "http://host/path" },
/* Bug 594405; 0-length is different from not-present */
{ "http://host/path?", "http://host/path?" },
- { "http://host/path#", "http://host/path#" }
+ { "http://host/path#", "http://host/path#" },
+
+ /* Bug 590524; ignore badly-%-encoding */
+ { "http://host/path%", "http://host/path%" },
+ { "http://h%ost/path", "http://h%25ost/path" },
+ { "http://host/path%%", "http://host/path%%" },
+ { "http://host/path%%%", "http://host/path%%%" },
+ { "http://host/path%/x/", "http://host/path%/x/" },
+ { "http://host/path%0x/", "http://host/path%0x/" },
+ { "http://host/path%ax", "http://host/path%ax" }
};
static int num_abs_tests = G_N_ELEMENTS(abs_tests);