1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
2 /* soup-uri.c : utility functions to parse URLs */
5 * Copyright 1999-2003 Ximian, Inc.
13 #include "soup-form.h"
14 #include "soup-misc.h"
18 * @short_description: URIs
20 * A #SoupURI represents a (parsed) URI.
22 * Many applications will not need to use #SoupURI directly at all; on
23 * the client side, soup_message_new() takes a stringified URI, and on
24 * the server side, the path and query components are provided for you
25 * in the server callback.
30 * @scheme: the URI scheme (eg, "http")
31 * @user: a username, or %NULL
32 * @password: a password, or %NULL
33 * @host: the hostname or IP address
34 * @port: the port number on @host
35 * @path: the path on @host
36 * @query: a query for @path, or %NULL
37 * @fragment: a fragment identifier within @path, or %NULL
39 * A #SoupURI represents a (parsed) URI. #SoupURI supports RFC 3986
40 * (URI Generic Syntax), and can parse any valid URI. However, libsoup
41 * only uses "http" and "https" URIs internally. You can use
42 * SOUP_URI_VALID_FOR_HTTP() to test if a #SoupURI is a valid HTTP
45 * @scheme will always be set in any URI. It is an interned string and
46 * is always all lowercase. (If you parse a URI with a non-lowercase
47 * scheme, it will be converted to lowercase.) The macros
48 * %SOUP_URI_SCHEME_HTTP and %SOUP_URI_SCHEME_HTTPS provide the
49 * interned values for "http" and "https" and can be compared against
52 * @user and @password are parsed as defined in the older URI specs
53 * (ie, separated by a colon; RFC 3986 only talks about a single
54 * "userinfo" field). Note that @password is not included in the
55 * output of soup_uri_to_string(). libsoup does not normally use these
56 * fields; authentication is handled via #SoupSession signals.
58 * @host contains the hostname, and @port the port specified in the
59 * URI. If the URI doesn't contain a hostname, @host will be %NULL,
60 * and if it doesn't specify a port, @port may be 0. However, for
61 * "http" and "https" URIs, @host is guaranteed to be non-%NULL
62 * (trying to parse an http URI with no @host will return %NULL), and
63 * @port will always be non-0 (because libsoup knows the default value
64 * to use when it is not specified in the URI).
66 * @path is always non-%NULL. For http/https URIs, @path will never be
67 * an empty string either; if the input URI has no path, the parsed
68 * #SoupURI will have a @path of "/".
70 * @query and @fragment are optional for all URI types.
71 * soup_form_decode_urlencoded() may be useful for parsing @query.
73 * Note that @path, @query, and @fragment may contain
74 * %<!-- -->-encoded characters. soup_uri_new() calls
75 * soup_uri_normalize() on them, but not soup_uri_decode(). This is
76 * necessary to ensure that soup_uri_to_string() will generate a URI
77 * that has exactly the same meaning as the original. (In theory,
78 * #SoupURI should leave @user, @password, and @host partially-encoded
79 * as well, but this would be more annoying than useful.)
83 * SOUP_URI_VALID_FOR_HTTP:
86 * Tests if @uri is a valid #SoupURI for HTTP communication; that is, if
87 * it can be used to construct a #SoupMessage.
89 * Return value: %TRUE if @uri is a valid "http" or "https" URI.
/* Forward declarations of the file-local %-encoding helpers, which are
 * defined further down in this file but used by the parser above them. */
92 static void append_uri_encoded (GString *str, const char *in, const char *extra_enc_chars);
93 static char *uri_decoded_copy (const char *str, int length);
94 static char *uri_normalized_copy (const char *str, int length, const char *unescape_extra);
/* Storage for the two well-known scheme pointers.
 * NOTE(review): presumably SOUP_URI_SCHEME_HTTP / SOUP_URI_SCHEME_HTTPS
 * in the header resolve to these variables -- confirm against soup-uri.h. */
96 const char *_SOUP_URI_SCHEME_HTTP, *_SOUP_URI_SCHEME_HTTPS;
98 static inline const char *
99 soup_uri_get_scheme (const char *scheme, int len)
101 if (len == 4 && !strncmp (scheme, "http", 4)) {
102 return SOUP_URI_SCHEME_HTTP;
103 } else if (len == 5 && !strncmp (scheme, "https", 5)) {
104 return SOUP_URI_SCHEME_HTTPS;
108 lower_scheme = g_ascii_strdown (scheme, len);
109 scheme = g_intern_string (lower_scheme);
110 g_free (lower_scheme);
/* Look up the default port for @scheme. The scheme is matched by pointer
 * identity against the shared SOUP_URI_SCHEME_* values, so @scheme must
 * be one of the interned scheme strings, not an arbitrary copy.
 * NOTE(review): the value returned by each branch is not visible in
 * this view -- confirm before documenting the exact ports. */
116 soup_scheme_default_port (const char *scheme)
118 if (scheme == SOUP_URI_SCHEME_HTTP)
120 else if (scheme == SOUP_URI_SCHEME_HTTPS)
127 * soup_uri_new_with_base:
129 * @uri_string: the URI
131 * Parses @uri_string relative to @base.
133 * Return value: a parsed #SoupURI.
/* Parse @uri_string into a freshly allocated, zero-filled SoupURI and
 * resolve it against @base when it carries no scheme of its own.
 *
 * The visible steps run in this order: fragment (text after the first
 * '#', stored only when non-empty), scheme, authority (user, password,
 * host, port), query, path, base resolution, dot-segment removal and
 * finally the http/https fix-ups.
 *
 * The string is consumed left to right by advancing @uri_string, while
 * "end" marks where the part still to be parsed stops.
 *
 * NOTE(review): the cleanup performed on the failure paths is not
 * visible in this view -- confirm before relying on it. */
136 soup_uri_new_with_base (SoupURI *base, const char *uri_string)
139 const char *end, *hash, *colon, *at, *path, *question;
140 const char *p, *hostend;
141 gboolean remove_dot_segments = TRUE;
143 uri = g_slice_new0 (SoupURI);
145 /* See RFC 3986 for details. IF YOU CHANGE ANYTHING IN THIS
146 * FUNCTION, RUN tests/uri-parsing AFTERWARDS.
150 end = hash = strchr (uri_string, '#');
151 if (hash && hash[1]) {
152 uri->fragment = uri_normalized_copy (hash + 1, strlen (hash + 1),
154 if (!uri->fragment) {
159 end = uri_string + strlen (uri_string);
161 /* Find scheme: initial run of ASCII alphanumerics, '+', '.' or '-' that is followed by ":" */
163 while (p < end && (g_ascii_isalnum (*p) ||
164 *p == '.' || *p == '+' || *p == '-'))
/* A scheme must be non-empty and terminated by ':'. */
167 if (p > uri_string && *p == ':') {
168 uri->scheme = soup_uri_get_scheme (uri_string, p - uri_string);
176 if (!*uri_string && !base)
179 /* Check for authority */
180 if (strncmp (uri_string, "//", 2) == 0) {
/* "path" marks the first '/', '?' or '#', i.e. where the authority ends. */
183 path = uri_string + strcspn (uri_string, "/?#");
184 at = strchr (uri_string, '@');
/* An '@' only counts as a userinfo separator inside the authority. */
185 if (at && at < path) {
186 colon = strchr (uri_string, ':');
/* "user:password@": the password is the text between ':' and '@'. */
187 if (colon && colon < at) {
188 uri->password = uri_decoded_copy (colon + 1,
190 if (!uri->password) {
195 uri->password = NULL;
199 uri->user = uri_decoded_copy (uri_string,
207 uri->user = uri->password = NULL;
209 /* Find host and port. */
/* A leading '[' starts a bracketed host; the matching ']' must appear
 * before the authority ends. */
210 if (*uri_string == '[') {
212 hostend = strchr (uri_string, ']');
213 if (!hostend || hostend > path) {
217 if (*(hostend + 1) == ':')
/* Unbracketed host: the search for ':' is bounded to the authority. */
222 colon = memchr (uri_string, ':', path - uri_string);
223 hostend = colon ? colon : path;
226 uri->host = uri_decoded_copy (uri_string, hostend - uri_string);
/* A ':' that is the last authority character means "no port given". */
232 if (colon && colon != path - 1) {
/* NOTE(review): the strtoul() result is stored without a visible range
 * check, so values above 65535 appear to be accepted -- confirm. */
234 uri->port = strtoul (colon + 1, &portend, 10);
/* The digits must run exactly up to the end of the authority. */
235 if (portend != (char *)path) {
245 question = memchr (uri_string, '?', end - uri_string);
248 uri->query = uri_normalized_copy (question + 1,
249 end - (question + 1),
/* Whatever is left before "end" is the path. */
259 if (end != uri_string) {
260 uri->path = uri_normalized_copy (uri_string, end - uri_string,
268 /* Apply base URI. Again, this is spelled out in RFC 3986. */
/* The reference has its own host but no scheme: only the scheme is inherited. */
269 if (base && !uri->scheme && uri->host)
270 uri->scheme = base->scheme;
/* No scheme and no host: inherit the whole authority from @base. */
271 else if (base && !uri->scheme) {
272 uri->scheme = base->scheme;
273 uri->user = g_strdup (base->user);
274 uri->password = g_strdup (base->password);
275 uri->host = g_strdup (base->host);
276 uri->port = base->port;
279 uri->path = g_strdup (base->path);
281 uri->query = g_strdup (base->query);
/* The path was copied from @base, so it is not re-normalized. */
282 remove_dot_segments = FALSE;
283 } else if (*uri->path != '/') {
284 char *newpath, *last;
/* Relative path: keep the base path up to its last '/' and append ours. */
286 last = strrchr (base->path, '/');
288 newpath = g_strdup_printf ("%.*s/%s",
289 (int)(last - base->path),
293 newpath = g_strdup_printf ("/%s", uri->path);
300 if (remove_dot_segments && uri->path && *uri->path) {
/* This writable "p" shadows the const char *p declared at function scope. */
301 char *p = uri->path, *q;
303 /* Remove "./" where "." is a complete segment. */
304 for (p = uri->path + 1; *p; ) {
305 if (*(p - 1) == '/' &&
306 *p == '.' && *(p + 1) == '/')
/* The "+ 1" moves the terminating NUL along with the tail. */
307 memmove (p, p + 2, strlen (p + 2) + 1);
311 /* Remove "." at end. */
312 if (p > uri->path + 2 &&
313 *(p - 1) == '.' && *(p - 2) == '/')
316 /* Remove "<segment>/../" where <segment> != ".." */
317 for (p = uri->path + 1; *p; ) {
318 if (!strncmp (p, "../", 3)) {
322 q = strchr (p + 1, '/');
325 if (strncmp (q, "/../", 4) != 0) {
329 memmove (p, q + 4, strlen (q + 4) + 1);
332 /* Remove "<segment>/.." at end where <segment> != ".." */
333 q = strrchr (uri->path, '/');
334 if (q && !strcmp (q, "/..")) {
336 while (p > uri->path && *p != '/')
338 if (strncmp (p, "/../", 4) != 0)
342 /* Remove extraneous initial "/.."s */
343 while (!strncmp (uri->path, "/../", 4))
/* strlen (path) - 2 equals strlen (path + 3) + 1, so the NUL moves too. */
344 memmove (uri->path, uri->path + 3, strlen (uri->path) - 2);
345 if (!strcmp (uri->path, "/.."))
349 /* HTTP-specific stuff */
350 if (uri->scheme == SOUP_URI_SCHEME_HTTP ||
351 uri->scheme == SOUP_URI_SCHEME_HTTPS) {
352 if (!SOUP_URI_VALID_FOR_HTTP (uri)) {
357 uri->path = g_strdup ("/");
361 uri->port = soup_scheme_default_port (uri->scheme);
363 uri->path = g_strdup ("");
372 * Parses an absolute URI.
374 * You can also pass %NULL for @uri_string if you want to get back an
375 * "empty" #SoupURI that you can fill in by hand.
377 * Return value: a #SoupURI, or %NULL.
/* Parse @uri_string as an absolute URI, i.e. without any base to
 * resolve it against. */
380 soup_uri_new (const char *uri_string)
/* A zero-filled SoupURI; the documentation for this function describes
 * this as the "empty" result for a NULL @uri_string.
 * NOTE(review): the guarding condition is not visible here -- confirm. */
385 return g_slice_new0 (SoupURI);
/* Delegate the real parsing, passing NULL as the base. */
387 uri = soup_uri_new_with_base (NULL, uri_string);
400 * soup_uri_to_string:
402 * @just_path_and_query: if %TRUE, output just the path and query portions
404 * Returns a string representing @uri.
406 * If @just_path_and_query is %TRUE, this concatenates the path and query
407 * together. That is, it constructs the string that would be needed in
408 * the Request-Line of an HTTP request for @uri.
410 * Return value: a string representing @uri, which the caller must free.
413 soup_uri_to_string (SoupURI *uri, gboolean just_path_and_query)
418 /* IF YOU CHANGE ANYTHING IN THIS FUNCTION, RUN
419 * tests/uri-parsing AFTERWARD.
422 str = g_string_sized_new (20);
424 if (uri->scheme && !just_path_and_query)
425 g_string_sprintfa (str, "%s:", uri->scheme);
426 if (uri->host && !just_path_and_query) {
427 g_string_append (str, "//");
429 append_uri_encoded (str, uri->user, ":;@?/");
430 g_string_append_c (str, '@');
432 if (strchr (uri->host, ':')) {
433 g_string_append_c (str, '[');
434 g_string_append (str, uri->host);
435 g_string_append_c (str, ']');
437 append_uri_encoded (str, uri->host, ":/");
438 if (uri->port && uri->port != soup_scheme_default_port (uri->scheme))
439 g_string_append_printf (str, ":%d", uri->port);
440 if (!uri->path && (uri->query || uri->fragment))
441 g_string_append_c (str, '/');
444 if (uri->path && *uri->path)
445 g_string_append (str, uri->path);
448 g_string_append_c (str, '?');
449 g_string_append (str, uri->query);
451 if (uri->fragment && !just_path_and_query) {
452 g_string_append_c (str, '#');
453 g_string_append (str, uri->fragment);
456 return_result = str->str;
457 g_string_free (str, FALSE);
459 return return_result;
468 * Return value: a copy of @uri, which must be freed with soup_uri_free()
471 soup_uri_copy (SoupURI *uri)
475 g_return_val_if_fail (uri != NULL, NULL);
477 dup = g_slice_new0 (SoupURI);
478 dup->scheme = uri->scheme;
479 dup->user = g_strdup (uri->user);
480 dup->password = g_strdup (uri->password);
481 dup->host = g_strdup (uri->host);
482 dup->port = uri->port;
483 dup->path = g_strdup (uri->path);
484 dup->query = g_strdup (uri->query);
485 dup->fragment = g_strdup (uri->fragment);
/* Compare two URI parts. When @insensitive is TRUE the comparison
 * ignores ASCII case; otherwise it is an exact byte comparison.
 * NOTE(review): the lines between the signature and the final return
 * are not visible in this view, so any NULL handling done there is not
 * described here. */
490 static inline gboolean
491 parts_equal (const char *one, const char *two, gboolean insensitive)
497 return insensitive ? !g_ascii_strcasecmp (one, two) : !strcmp (one, two);
503 * @uri2: another #SoupURI
505 * Tests whether or not @uri1 and @uri2 are equal in all parts
507 * Return value: %TRUE or %FALSE
510 soup_uri_equal (SoupURI *uri1, SoupURI *uri2)
512 if (uri1->scheme != uri2->scheme ||
513 uri1->port != uri2->port ||
514 !parts_equal (uri1->user, uri2->user, FALSE) ||
515 !parts_equal (uri1->password, uri2->password, FALSE) ||
516 !parts_equal (uri1->host, uri2->host, TRUE) ||
517 !parts_equal (uri1->path, uri2->path, FALSE) ||
518 !parts_equal (uri1->query, uri2->query, FALSE) ||
519 !parts_equal (uri1->fragment, uri2->fragment, FALSE))
532 soup_uri_free (SoupURI *uri)
534 g_return_if_fail (uri != NULL);
537 g_free (uri->password);
541 g_free (uri->fragment);
543 g_slice_free (SoupURI, uri);
/* Character classes used by uri_encoded_char[] below.
 * SOUP_URI_UNRESERVED is 0, so it cannot be tested with '&'; code that
 * looks for it compares the table entry for equality instead. The
 * other three values are distinct bits and can be OR-ed into a mask. */
547 #define SOUP_URI_UNRESERVED 0
548 #define SOUP_URI_PCT_ENCODED 1
549 #define SOUP_URI_GEN_DELIMS 2
550 #define SOUP_URI_SUB_DELIMS 4
/* Class of every byte value, indexed by the byte itself (the users in
 * this file index it with an unsigned char). Each row holds 16 entries;
 * the trailing comment on a row lists the characters it covers. */
551 static const char uri_encoded_char[] = {
552 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x00 - 0x0f */
553 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x10 - 0x1f */
554 1, 4, 1, 2, 4, 1, 4, 4, 4, 4, 4, 4, 4, 0, 0, 2, /* !"#$%&'()*+,-./ */
555 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 4, 1, 4, 1, 2, /* 0123456789:;<=>? */
556 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* @ABCDEFGHIJKLMNO */
557 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 2, 1, 0, /* PQRSTUVWXYZ[\]^_ */
558 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* `abcdefghijklmno */
559 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, /* pqrstuvwxyz{|}~ */
/* 0x80 - 0xff: every byte outside ASCII is SOUP_URI_PCT_ENCODED. */
560 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
561 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
562 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
563 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
564 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
565 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
566 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
567 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
571 append_uri_encoded (GString *str, const char *in, const char *extra_enc_chars)
573 const unsigned char *s = (const unsigned char *)in;
576 if ((uri_encoded_char[*s] & (SOUP_URI_PCT_ENCODED | SOUP_URI_GEN_DELIMS)) ||
577 (extra_enc_chars && strchr (extra_enc_chars, *s)))
578 g_string_append_printf (str, "%%%02X", (int)*s++);
580 g_string_append_c (str, *s++);
587 * @escape_extra: additional reserved characters to escape (or %NULL)
589 * This %<!-- -->-encodes the given URI part and returns the escaped
590 * version in allocated memory, which the caller must free when it is
593 * Return value: the encoded URI part
596 soup_uri_encode (const char *part, const char *escape_extra)
601 str = g_string_new (NULL);
602 append_uri_encoded (str, part, escape_extra);
604 g_string_free (str, FALSE);
/* Numeric value (0-15) of the ASCII hex digit @c. The caller must have
 * checked that @c really is a hex digit. Masking with 0x4F folds
 * 'a'-'f' onto 'A'-'F'. Note that @c is evaluated more than once. */
#define XDIGIT(c) ((c) <= '9' ? (c) - '0' : ((c) & 0x4F) - 'A' + 10)
/* Byte value encoded by the three-character escape "%XY" that @s points
 * at (@s addresses the '%'). @s is parenthesized so that pointer
 * expressions such as "p + 2" index correctly. */
#define HEXCHAR(s) ((XDIGIT ((s)[1]) << 4) + XDIGIT ((s)[2]))
/* Work on a private copy of the first @length bytes of @part and decode
 * its %XX escapes in place.
 * The only caller contract visible in this file is the documentation of
 * soup_uri_decode(), which returns this function's result directly and
 * promises NULL for an invalid percent code.
 * NOTE(review): the loop around the visible lines is hidden in this
 * view -- confirm the details there. */
613 uri_decoded_copy (const char *part, int length)
615 unsigned char *s, *d;
616 char *decoded = g_strndup (part, length);
/* "s" and "d" both start at the beginning of the copy. */
618 s = d = (unsigned char *)decoded;
/* Both characters after the '%' must be hex digits. If s[1] is the
 * terminator the first test already fails, so s[2] is never read past
 * the end of the copy. */
621 if (!g_ascii_isxdigit (s[1]) ||
622 !g_ascii_isxdigit (s[2])) {
/**
 * soup_uri_decode:
 * @part: a URI part
 *
 * Fully %<!-- -->-decodes @part.
 *
 * Return value: the decoded URI part, or %NULL if an invalid percent
 * code was encountered.
 **/
char *
soup_uri_decode (const char *part)
{
	/* strlen() on a NULL pointer is undefined behavior, so reject
	 * it before measuring the string.
	 */
	g_return_val_if_fail (part != NULL, NULL);

	return uri_decoded_copy (part, strlen (part));
}
/* Work on a private copy of the first @length bytes of @part and
 * normalize its %XX escapes in place: an escape is replaced by the byte
 * it stands for only when that byte is unreserved or listed in
 * @unescape_extra; an escape that is kept gets its hex digits
 * uppercased.
 * NOTE(review): the loop around the visible lines is hidden in this
 * view -- confirm the details there. */
651 uri_normalized_copy (const char *part, int length, const char *unescape_extra)
653 unsigned char *s, *d, c;
654 char *normalized = g_strndup (part, length);
/* "s" and "d" both start at the beginning of the copy. */
656 s = d = (unsigned char *)normalized;
/* Both characters after the '%' must be hex digits. */
659 if (!g_ascii_isxdigit (s[1]) ||
660 !g_ascii_isxdigit (s[2])) {
/* NOTE(review): assuming "c" holds the decoded byte (its assignment is
 * not visible here), strchr() also matches the terminator, so "%00"
 * would count as "listed" whenever @unescape_extra is non-NULL --
 * confirm whether that is intended. */
666 if (uri_encoded_char[c] == SOUP_URI_UNRESERVED ||
667 (unescape_extra && strchr (unescape_extra, c))) {
/* Escape kept as-is: canonicalize its two hex digits to uppercase. */
672 *d++ = g_ascii_toupper (*s++);
673 *d++ = g_ascii_toupper (*s);
/**
 * soup_uri_normalize:
 * @part: a URI part
 * @unescape_extra: reserved characters to unescape (or %NULL)
 *
 * %<!-- -->-decodes any "unreserved" characters (or characters in
 * @unescape_extra) in @part.
 *
 * "Unreserved" characters are those that are not allowed to be used
 * for punctuation according to the URI spec. For example, letters are
 * unreserved, so soup_uri_normalize() will turn
 * <literal>http://example.com/foo/b%<!-- -->61r</literal> into
 * <literal>http://example.com/foo/bar</literal>, which is guaranteed
 * to mean the same thing. However, "/" is "reserved", so
 * <literal>http://example.com/foo%<!-- -->2Fbar</literal> would not
 * be changed, because it might mean something different to the
 * server.
 *
 * Return value: the normalized URI part, or %NULL if an invalid percent
 * code was encountered.
 **/
char *
soup_uri_normalize (const char *part, const char *unescape_extra)
{
	/* strlen() on a NULL pointer is undefined behavior, so reject
	 * it before measuring the string.
	 */
	g_return_val_if_fail (part != NULL, NULL);

	return uri_normalized_copy (part, strlen (part), unescape_extra);
}
711 * soup_uri_uses_default_port:
714 * Tests if @uri uses the default port for its scheme. (Eg, 80 for
715 * http.) (This only works for http and https; libsoup does not know
716 * the default ports of other protocols.)
718 * Return value: %TRUE or %FALSE
721 soup_uri_uses_default_port (SoupURI *uri)
723 g_return_val_if_fail (uri->scheme == SOUP_URI_SCHEME_HTTP ||
724 uri->scheme == SOUP_URI_SCHEME_HTTPS, FALSE);
726 return uri->port == soup_scheme_default_port (uri->scheme);
730 * SOUP_URI_SCHEME_HTTP:
732 * "http" as an interned string. This can be compared directly against
733 * the value of a #SoupURI's <structfield>scheme</structfield>
737 * SOUP_URI_SCHEME_HTTPS:
739 * "https" as an interned string. This can be compared directly
740 * against the value of a #SoupURI's <structfield>scheme</structfield>
744 * soup_uri_set_scheme:
746 * @scheme: the URI scheme
748 * Sets @uri's scheme to @scheme. This will also set @uri's port to
749 * the default port for @scheme, if known.
752 soup_uri_set_scheme (SoupURI *uri, const char *scheme)
754 uri->scheme = soup_uri_get_scheme (scheme, strlen (scheme));
755 uri->port = soup_scheme_default_port (uri->scheme);
761 * @user: the username, or %NULL
763 * Sets @uri's user to @user.
766 soup_uri_set_user (SoupURI *uri, const char *user)
769 uri->user = g_strdup (user);
773 * soup_uri_set_password:
775 * @password: the password, or %NULL
777 * Sets @uri's password to @password.
780 soup_uri_set_password (SoupURI *uri, const char *password)
782 g_free (uri->password);
783 uri->password = g_strdup (password);
789 * @host: the hostname or IP address, or %NULL
791 * Sets @uri's host to @host.
793 * If @host is an IPv6 IP address, it should not include the brackets
794 * required by the URI syntax; they will be added automatically when
795 * converting @uri to a string.
798 soup_uri_set_host (SoupURI *uri, const char *host)
801 uri->host = g_strdup (host);
807 * @port: the port, or 0
809 * Sets @uri's port to @port. If @port is 0, @uri will not have an
810 * explicitly-specified port.
/* Setter for @uri's port; @port is unsigned.
 * NOTE(review): the function body is not visible in this view. */
813 soup_uri_set_port (SoupURI *uri, guint port)
823 * Sets @uri's path to @path.
826 soup_uri_set_path (SoupURI *uri, const char *path)
829 uri->path = g_strdup (path);
833 * soup_uri_set_query:
837 * Sets @uri's query to @query.
840 soup_uri_set_query (SoupURI *uri, const char *query)
843 uri->query = g_strdup (query);
847 * soup_uri_set_query_from_form:
849 * @form: a #GHashTable containing HTML form information
851 * Sets @uri's query to the result of encoding @form according to the
852 * HTML form rules. See soup_form_encode_hash() for more information.
855 soup_uri_set_query_from_form (SoupURI *uri, GHashTable *form)
858 uri->query = soup_form_encode_urlencoded (form);
862 * soup_uri_set_query_from_fields:
864 * @first_field: name of the first form field to encode into query
865 * @...: value of @first_field, followed by additional field names
866 * and values, terminated by %NULL.
868 * Sets @uri's query to the result of encoding the given form fields
869 * and values according to the HTML form rules. See
870 * soup_form_encode() for more information.
873 soup_uri_set_query_from_fields (SoupURI *uri,
874 const char *first_field,
880 va_start (args, first_field);
881 uri->query = soup_form_encode_valist (first_field, args);
886 * soup_uri_set_fragment:
888 * @fragment: the fragment
890 * Sets @uri's fragment to @fragment.
893 soup_uri_set_fragment (SoupURI *uri, const char *fragment)
895 g_free (uri->fragment);
896 uri->fragment = g_strdup (fragment);
901 soup_uri_get_type (void)
903 static volatile gsize type_volatile = 0;
905 if (g_once_init_enter (&type_volatile)) {
906 GType type = g_boxed_type_register_static (
907 g_intern_static_string ("SoupURI"),
908 (GBoxedCopyFunc) soup_uri_copy,
909 (GBoxedFreeFunc) soup_uri_free);
910 g_once_init_leave (&type_volatile, type);
912 return type_volatile;