glib/gurifuncs.c

   1 /* GIO - GLib Input, Output and Streaming Library
   2  *
   3  * Copyright (C) 2006-2007 Red Hat, Inc.
   4  *
   5  * This library is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU Lesser General Public
   7  * License as published by the Free Software Foundation; either
   8  * version 2 of the License, or (at your option) any later version.
   9  *
  10  * This library is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * Lesser General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU Lesser General
  16  * Public License along with this library; if not, write to the
  17  * Free Software Foundation, Inc., 59 Temple Place, Suite 330,
  18  * Boston, MA 02111-1307, USA.
  19  *
  20  * Author: Alexander Larsson <alexl@redhat.com>
  21  */
  22
  23 #include "config.h"
  24
  25 #include "gurifuncs.h"
  26
  27 #include <glib/gstrfuncs.h>
  28 #include <glib/gmessages.h>
  29 #include <glib/gstring.h>
  30 #include <glib/gmem.h>
  31
  32 #include <string.h>
  33
  34 #include "config.h"
  35
  36 /**
  37  * SECTION:gurifuncs
  38  * @title: URI Functions
  39  * @short_description: manipulating URIs
  40  *
  41  * Functions for manipulating Universal Resource Identifiers (URIs) as
  42  * defined by <ulink url="http://www.ietf.org/rfc/rfc3986.txt">
 * RFC 3986</ulink>. It is highly recommended that you read and
 * understand RFC 3986 before using this API.
  45  */
  46
  47 static int
  48 unescape_character (const char *scanner)
  49 {
  50   int first_digit;
  51   int second_digit;
  52
  53   first_digit = g_ascii_xdigit_value (*scanner++);
  54   if (first_digit < 0)
  55     return -1;
  56
  57   second_digit = g_ascii_xdigit_value (*scanner++);
  58   if (second_digit < 0)
  59     return -1;
  60
  61   return (first_digit << 4) | second_digit;
  62 }
  63
  64 /**
  65  * g_uri_unescape_segment:
  66  * @escaped_string: (allow-none): A string, may be %NULL
  67  * @escaped_string_end: (allow-none): Pointer to end of @escaped_string, may be %NULL
  68  * @illegal_characters: (allow-none): An optional string of illegal characters not to be allowed, may be %NULL
  69  *
  70  * Unescapes a segment of an escaped string.
  71  *
  72  * If any of the characters in @illegal_characters or the character zero appears
  73  * as an escaped character in @escaped_string then that is an error and %NULL
 * will be returned. This is useful if you want to avoid for instance having a
  75  * slash being expanded in an escaped path element, which might confuse pathname
  76  * handling.
  77  *
  78  * Returns: an unescaped version of @escaped_string or %NULL on error.
  79  * The returned string should be freed when no longer needed.  As a
  80  * special case if %NULL is given for @escaped_string, this function
  81  * will return %NULL.
  82  *
  83  * Since: 2.16
  84  **/
  85 char *
  86 g_uri_unescape_segment (const char *escaped_string,
  87                         const char *escaped_string_end,
  88                         const char *illegal_characters)
  89 {
  90   const char *in;
  91   char *out, *result;
  92   gint character;
  93
  94   if (escaped_string == NULL)
  95     return NULL;
  96
  97   if (escaped_string_end == NULL)
  98     escaped_string_end = escaped_string + strlen (escaped_string);
  99
 100   result = g_malloc (escaped_string_end - escaped_string + 1);
 101
 102   out = result;
 103   for (in = escaped_string; in < escaped_string_end; in++)
 104     {
 105       character = *in;
 106
 107       if (*in == '%')
 108         {
 109           in++;
 110
 111           if (escaped_string_end - in < 2)
 112             {
 113               /* Invalid escaped char (to short) */
 114               g_free (result);
 115               return NULL;
 116             }
 117
 118           character = unescape_character (in);
 119
 120           /* Check for an illegal character. We consider '\0' illegal here. */
 121           if (character <= 0 ||
 122               (illegal_characters != NULL &&
 123                strchr (illegal_characters, (char)character) != NULL))
 124             {
 125               g_free (result);
 126               return NULL;
 127             }
 128
 129           in++; /* The other char will be eaten in the loop header */
 130         }
 131       *out++ = (char)character;
 132     }
 133
 134   *out = '\0';
 135
 136   return result;
 137 }
 138
 139 /**
 140  * g_uri_unescape_string:
 141  * @escaped_string: an escaped string to be unescaped.
 142  * @illegal_characters: (allow-none): a string of illegal characters not to be
 143  *      allowed, or %NULL.
 144  *
 145  * Unescapes a whole escaped string.
 146  *
 147  * If any of the characters in @illegal_characters or the character zero appears
 148  * as an escaped character in @escaped_string then that is an error and %NULL
 * will be returned. This is useful if you want to avoid for instance having a
 150  * slash being expanded in an escaped path element, which might confuse pathname
 151  * handling.
 152  *
 * Returns: an unescaped version of @escaped_string, or %NULL on error.
 * The returned string should be freed when no longer needed.
 155  *
 156  * Since: 2.16
 157  **/
 158 char *
 159 g_uri_unescape_string (const char *escaped_string,
 160                        const char *illegal_characters)
 161 {
 162   return g_uri_unescape_segment (escaped_string, NULL, illegal_characters);
 163 }
 164
 165 /**
 166  * g_uri_parse_scheme:
 167  * @uri: a valid URI.
 168  *
 * Gets the scheme portion of a URI string. RFC 3986 defines the URI syntax as:
 170  * <programlisting>
 171  * URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
 172  * </programlisting>
 173  * Common schemes include "file", "http", "svn+ssh", etc.
 174  *
 175  * Returns: The "Scheme" component of the URI, or %NULL on error.
 176  * The returned string should be freed when no longer needed.
 177  *
 178  * Since: 2.16
 179  **/
 180 char *
 181 g_uri_parse_scheme (const char  *uri)
 182 {
 183   const char *p;
 184   char c;
 185
 186   g_return_val_if_fail (uri != NULL, NULL);
 187
 188   /* From RFC 3986 Decodes:
 189    * URI         = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
 190    */
 191
 192   p = uri;
 193
 194   /* Decode scheme:
 195      scheme      = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
 196   */
 197
 198   if (!g_ascii_isalpha (*p))
 199     return NULL;
 200
 201   while (1)
 202     {
 203       c = *p++;
 204
 205       if (c == ':')
 206         break;
 207
 208       if (!(g_ascii_isalnum(c) ||
 209             c == '+' ||
 210             c == '-' ||
 211             c == '.'))
 212         return NULL;
 213     }
 214
 215   return g_strndup (uri, p - uri - 1);
 216 }
 217
 218 /**
 219  * g_uri_escape_string:
 220  * @unescaped: the unescaped input string.
 221  * @reserved_chars_allowed: (allow-none): a string of reserved characters that
 222  *      are allowed to be used, or %NULL.
 223  * @allow_utf8: %TRUE if the result can include UTF-8 characters.
 224  *
 225  * Escapes a string for use in a URI.
 226  *
 227  * Normally all characters that are not "unreserved" (i.e. ASCII alphanumerical
 228  * characters plus dash, dot, underscore and tilde) are escaped.
 229  * But if you specify characters in @reserved_chars_allowed they are not
 230  * escaped. This is useful for the "reserved" characters in the URI
 231  * specification, since those are allowed unescaped in some portions of
 232  * a URI.
 233  *
 234  * Returns: an escaped version of @unescaped. The returned string should be
 235  * freed when no longer needed.
 236  *
 237  * Since: 2.16
 238  **/
 239 char *
 240 g_uri_escape_string (const char *unescaped,
 241                      const char  *reserved_chars_allowed,
 242                      gboolean     allow_utf8)
 243 {
 244   GString *s;
 245
 246   g_return_val_if_fail (unescaped != NULL, NULL);
 247
 248   s = g_string_sized_new (strlen (unescaped) + 10);
 249
 250   g_string_append_uri_escaped (s, unescaped, reserved_chars_allowed, allow_utf8);
 251
 252   return g_string_free (s, FALSE);
 253 }