glib/gurifuncs.c

   1 /* GIO - GLib Input, Output and Streaming Library
   2  *
   3  * Copyright (C) 2006-2007 Red Hat, Inc.
   4  *
   5  * This library is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU Lesser General Public
   7  * License as published by the Free Software Foundation; either
   8  * version 2 of the License, or (at your option) any later version.
   9  *
  10  * This library is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * Lesser General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU Lesser General
  16  * Public License along with this library; if not, write to the
  17  * Free Software Foundation, Inc., 59 Temple Place, Suite 330,
  18  * Boston, MA 02111-1307, USA.
  19  *
  20  * Author: Alexander Larsson <alexl@redhat.com>
  21  */
  22
  23 #include "config.h"
  24 #include "gurifuncs.h"
  25 #include <glib.h>
  26 #include "string.h"
  27
  28 #include "galias.h"
  29
/**
 * SECTION:gurifuncs
 * @short_description: URI Functions
 *
 * Functions for manipulating Uniform Resource Identifiers (URIs) as
 * defined by <ulink url="http://www.ietf.org/rfc/rfc3986.txt">
 * RFC 3986</ulink>. It is highly recommended that you read and
 * understand RFC 3986 before using this API.
 */
  39
  40 static int
  41 unescape_character (const char *scanner)
  42 {
  43   int first_digit;
  44   int second_digit;
  45
  46   first_digit = g_ascii_xdigit_value (*scanner++);
  47   if (first_digit < 0)
  48     return -1;
  49
  50   second_digit = g_ascii_xdigit_value (*scanner++);
  51   if (second_digit < 0)
  52     return -1;
  53
  54   return (first_digit << 4) | second_digit;
  55 }
  56
  57 /**
  58  * g_uri_unescape_segment:
  59  * @escaped_string: a string.
  60  * @escaped_string_end: a string.
  61  * @illegal_characters: an optional string of illegal characters not to be allowed.
  62  *
  63  * Unescapes a segment of an escaped string.
  64  *
  65  * If any of the characters in @illegal_characters or the character zero appears
  66  * as an escaped character in @escaped_string then that is an error and %NULL
  67  * will be returned. This is useful it you want to avoid for instance having a
  68  * slash being expanded in an escaped path element, which might confuse pathname
  69  * handling.
  70  *
  71  * Returns: an unescaped version of @escaped_string or %NULL on error.
  72  * The returned string should be freed when no longer needed.
  73  *
  74  * Since: 2.16
  75  **/
  76 char *
  77 g_uri_unescape_segment (const char *escaped_string,
  78                         const char *escaped_string_end,
  79                         const char *illegal_characters)
  80 {
  81   const char *in;
  82   char *out, *result;
  83   gint character;
  84
  85   if (escaped_string == NULL)
  86     return NULL;
  87
  88   if (escaped_string_end == NULL)
  89     escaped_string_end = escaped_string + strlen (escaped_string);
  90
  91   result = g_malloc (escaped_string_end - escaped_string + 1);
  92
  93   out = result;
  94   for (in = escaped_string; in < escaped_string_end; in++)
  95     {
  96       character = *in;
  97
  98       if (*in == '%')
  99         {
 100           in++;
 101
 102           if (escaped_string_end - in < 2)
 103             {
 104               /* Invalid escaped char (to short) */
 105               g_free (result);
 106               return NULL;
 107             }
 108
 109           character = unescape_character (in);
 110
 111           /* Check for an illegal character. We consider '\0' illegal here. */
 112           if (character <= 0 ||
 113               (illegal_characters != NULL &&
 114                strchr (illegal_characters, (char)character) != NULL))
 115             {
 116               g_free (result);
 117               return NULL;
 118             }
 119
 120           in++; /* The other char will be eaten in the loop header */
 121         }
 122       *out++ = (char)character;
 123     }
 124
 125   *out = '\0';
 126
 127   return result;
 128 }
 129
 130 /**
 131  * g_uri_unescape_string:
 132  * @escaped_string: an escaped string to be unescaped.
 133  * @illegal_characters: an optional string of illegal characters not to be allowed.
 134  *
 135  * Unescapes a whole escaped string.
 136  *
 137  * If any of the characters in @illegal_characters or the character zero appears
 138  * as an escaped character in @escaped_string then that is an error and %NULL
 139  * will be returned. This is useful it you want to avoid for instance having a
 140  * slash being expanded in an escaped path element, which might confuse pathname
 141  * handling.
 142  *
 143  * Returns: an unescaped version of @escaped_string. The returned string
 144  * should be freed when no longer needed.
 145  *
 146  * Since: 2.16
 147  **/
 148 char *
 149 g_uri_unescape_string (const char *escaped_string,
 150                        const char *illegal_characters)
 151 {
 152   return g_uri_unescape_segment (escaped_string, NULL, illegal_characters);
 153 }
 154
 155 /**
 156  * g_uri_parse_scheme:
 157  * @uri: a valid URI.
 158  *
 159  * Gets the scheme portion of a URI string. RFC 3986 decodes the scheme as:
 160  * <programlisting>
 161  * URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
 162  * </programlisting>
 163  * Common schemes include "file", "http", "svn+ssh", etc.
 164  *
 165  * Returns: The "Scheme" component of the URI, or %NULL on error.
 166  * The returned string should be freed when no longer needed.
 167  *
 168  * Since: 2.16
 169  **/
 170 char *
 171 g_uri_parse_scheme (const char  *uri)
 172 {
 173   const char *p;
 174   char c;
 175
 176   g_return_val_if_fail (uri != NULL, NULL);
 177
 178   /* From RFC 3986 Decodes:
 179    * URI         = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
 180    */
 181
 182   p = uri;
 183
 184   /* Decode scheme:
 185      scheme      = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
 186   */
 187
 188   if (!g_ascii_isalpha (*p))
 189     return NULL;
 190
 191   while (1)
 192     {
 193       c = *p++;
 194
 195       if (c == ':')
 196         break;
 197
 198       if (!(g_ascii_isalnum(c) ||
 199             c == '+' ||
 200             c == '-' ||
 201             c == '.'))
 202         return NULL;
 203     }
 204
 205   return g_strndup (uri, p - uri - 1);
 206 }
 207
 208 /**
 209  * g_uri_escape_string:
 210  * @unescaped: the unescaped input string.
 211  * @reserved_chars_allowed: a string of reserved characters that are
 212  *      allowed to be used.
 213  * @allow_utf8: %TRUE if the result can include UTF-8 characters.
 214  *
 215  * Escapes a string for use in a URI.
 216  *
 217  * Normally all characters that are not "unreserved" (i.e. ASCII alphanumerical
 218  * characters plus dash, dot, underscore and tilde) are escaped.
 219  * But if you specify characters in @reserved_chars_allowed they are not
 220  * escaped. This is useful for the "reserved" characters in the URI
 221  * specification, since those are allowed unescaped in some portions of
 222  * a URI.
 223  *
 224  * Returns: an escaped version of @unescaped. The returned string should be
 225  * freed when no longer needed.
 226  *
 227  * Since: 2.16
 228  **/
 229 char *
 230 g_uri_escape_string (const char *unescaped,
 231                      const char  *reserved_chars_allowed,
 232                      gboolean     allow_utf8)
 233 {
 234   GString *s;
 235
 236   g_return_val_if_fail (unescaped != NULL, NULL);
 237
 238   s = g_string_sized_new (strlen (unescaped) + 10);
 239
 240   g_string_append_uri_escaped (s, unescaped, reserved_chars_allowed, allow_utf8);
 241
 242   return g_string_free (s, FALSE);
 243 }
 244
 245 #define __G_URI_FUNCS_C__
 246 #include "galiasdef.c"