glib/gconvert.c

   1 /* GLIB - Library of useful routines for C programming
   2  *
   3  * gconvert.c: Convert between character sets using iconv
   4  * Copyright Red Hat Inc., 2000
    5  * Authors: Havoc Pennington <hp@redhat.com>, Owen Taylor <otaylor@redhat.com>
   6  *
   7  * This library is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2 of the License, or (at your option) any later version.
  11  *
  12  * This library is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with this library; if not, write to the
  19  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20  * Boston, MA 02111-1307, USA.
  21  */
  22
  23 #include "config.h"
  24
  25 #include <iconv.h>
  26 #include <errno.h>
  27 #include <stdio.h>
  28 #include <string.h>
  29 #include <stdlib.h>
  30
  31 #include "glib.h"
  32 #include "gprintfint.h"
  33 #include "gthreadinit.h"
  34
  35 #ifdef G_PLATFORM_WIN32
  36 #define STRICT
  37 #include <windows.h>
  38 #undef STRICT
  39 #endif
  40
  41 #include "glibintl.h"
  42
  43 #if defined(USE_LIBICONV_GNU) && !defined (_LIBICONV_H)
  44 #error GNU libiconv in use but included iconv.h not from libiconv
  45 #endif
  46 #if !defined(USE_LIBICONV_GNU) && defined (_LIBICONV_H)
  47 #error GNU libiconv not in use but included iconv.h is from libiconv
  48 #endif
  49
  50 #include "galias.h"
  51
  52 GQuark
  53 g_convert_error_quark (void)
  54 {
  55   return g_quark_from_static_string ("g_convert_error");
  56 }
  57
  58 static gboolean
  59 try_conversion (const char *to_codeset,
  60                 const char *from_codeset,
  61                 iconv_t    *cd)
  62 {
  63   *cd = iconv_open (to_codeset, from_codeset);
  64
  65   if (*cd == (iconv_t)-1 && errno == EINVAL)
  66     return FALSE;
  67   else
  68     return TRUE;
  69 }
  70
  71 static gboolean
  72 try_to_aliases (const char **to_aliases,
  73                 const char  *from_codeset,
  74                 iconv_t     *cd)
  75 {
  76   if (to_aliases)
  77     {
  78       const char **p = to_aliases;
  79       while (*p)
  80         {
  81           if (try_conversion (*p, from_codeset, cd))
  82             return TRUE;
  83
  84           p++;
  85         }
  86     }
  87
  88   return FALSE;
  89 }
  90
  91 extern const char **_g_charset_get_aliases (const char *canonical_name) G_GNUC_INTERNAL;
  92
  93 /**
  94  * g_iconv_open:
  95  * @to_codeset: destination codeset
  96  * @from_codeset: source codeset
  97  *
  98  * Same as the standard UNIX routine iconv_open(), but
  99  * may be implemented via libiconv on UNIX flavors that lack
 100  * a native implementation.
 101  *
 102  * GLib provides g_convert() and g_locale_to_utf8() which are likely
 103  * more convenient than the raw iconv wrappers.
 104  *
 105  * Return value: a "conversion descriptor", or (GIConv)-1 if
 106  *  opening the converter failed.
 107  **/
 108 GIConv
 109 g_iconv_open (const gchar  *to_codeset,
 110               const gchar  *from_codeset)
 111 {
 112   iconv_t cd;
 113
 114   if (!try_conversion (to_codeset, from_codeset, &cd))
 115     {
 116       const char **to_aliases = _g_charset_get_aliases (to_codeset);
 117       const char **from_aliases = _g_charset_get_aliases (from_codeset);
 118
 119       if (from_aliases)
 120         {
 121           const char **p = from_aliases;
 122           while (*p)
 123             {
 124               if (try_conversion (to_codeset, *p, &cd))
 125                 goto out;
 126
 127               if (try_to_aliases (to_aliases, *p, &cd))
 128                 goto out;
 129
 130               p++;
 131             }
 132         }
 133
 134       if (try_to_aliases (to_aliases, from_codeset, &cd))
 135         goto out;
 136     }
 137
 138  out:
 139   return (cd == (iconv_t)-1) ? (GIConv)-1 : (GIConv)cd;
 140 }
 141
 142 /**
 143  * g_iconv:
 144  * @converter: conversion descriptor from g_iconv_open()
 145  * @inbuf: bytes to convert
 146  * @inbytes_left: inout parameter, bytes remaining to convert in @inbuf
 147  * @outbuf: converted output bytes
 148  * @outbytes_left: inout parameter, bytes available to fill in @outbuf
 149  *
 150  * Same as the standard UNIX routine iconv(), but
 151  * may be implemented via libiconv on UNIX flavors that lack
 152  * a native implementation.
 153  *
 154  * GLib provides g_convert() and g_locale_to_utf8() which are likely
 155  * more convenient than the raw iconv wrappers.
 156  *
 157  * Return value: count of non-reversible conversions, or -1 on error
 158  **/
 159 size_t
 160 g_iconv (GIConv   converter,
 161          gchar  **inbuf,
 162          gsize   *inbytes_left,
 163          gchar  **outbuf,
 164          gsize   *outbytes_left)
 165 {
 166   iconv_t cd = (iconv_t)converter;
 167
 168   return iconv (cd, inbuf, inbytes_left, outbuf, outbytes_left);
 169 }
 170
 171 /**
 172  * g_iconv_close:
 173  * @converter: a conversion descriptor from g_iconv_open()
 174  *
 175  * Same as the standard UNIX routine iconv_close(), but
 176  * may be implemented via libiconv on UNIX flavors that lack
 177  * a native implementation. Should be called to clean up
 178  * the conversion descriptor from g_iconv_open() when
 179  * you are done converting things.
 180  *
 181  * GLib provides g_convert() and g_locale_to_utf8() which are likely
 182  * more convenient than the raw iconv wrappers.
 183  *
 184  * Return value: -1 on error, 0 on success
 185  **/
 186 gint
 187 g_iconv_close (GIConv converter)
 188 {
 189   iconv_t cd = (iconv_t)converter;
 190
 191   return iconv_close (cd);
 192 }
 193
 194
 195 #ifdef NEED_ICONV_CACHE
 196
 197 #define ICONV_CACHE_SIZE   (16)
 198
 199 struct _iconv_cache_bucket {
 200   gchar *key;
 201   guint32 refcount;
 202   gboolean used;
 203   GIConv cd;
 204 };
 205
 206 static GList *iconv_cache_list;
 207 static GHashTable *iconv_cache;
 208 static GHashTable *iconv_open_hash;
 209 static guint iconv_cache_size = 0;
 210 G_LOCK_DEFINE_STATIC (iconv_cache_lock);
 211
 212 /* caller *must* hold the iconv_cache_lock */
 213 static void
 214 iconv_cache_init (void)
 215 {
 216   static gboolean initialized = FALSE;
 217
 218   if (initialized)
 219     return;
 220
 221   iconv_cache_list = NULL;
 222   iconv_cache = g_hash_table_new (g_str_hash, g_str_equal);
 223   iconv_open_hash = g_hash_table_new (g_direct_hash, g_direct_equal);
 224
 225   initialized = TRUE;
 226 }
 227
 228
 229 /**
 230  * iconv_cache_bucket_new:
 231  * @key: cache key
 232  * @cd: iconv descriptor
 233  *
 234  * Creates a new cache bucket, inserts it into the cache and
 235  * increments the cache size.
 236  *
 237  * Returns a pointer to the newly allocated cache bucket.
 238  **/
 239 static struct _iconv_cache_bucket *
 240 iconv_cache_bucket_new (const gchar *key, GIConv cd)
 241 {
 242   struct _iconv_cache_bucket *bucket;
 243
 244   bucket = g_new (struct _iconv_cache_bucket, 1);
 245   bucket->key = g_strdup (key);
 246   bucket->refcount = 1;
 247   bucket->used = TRUE;
 248   bucket->cd = cd;
 249
 250   g_hash_table_insert (iconv_cache, bucket->key, bucket);
 251
 252   /* FIXME: if we sorted the list so items with few refcounts were
 253      first, then we could expire them faster in iconv_cache_expire_unused () */
 254   iconv_cache_list = g_list_prepend (iconv_cache_list, bucket);
 255
 256   iconv_cache_size++;
 257
 258   return bucket;
 259 }
 260
 261
 262 /**
 263  * iconv_cache_bucket_expire:
 264  * @node: cache bucket's node
 265  * @bucket: cache bucket
 266  *
 267  * Expires a single cache bucket @bucket. This should only ever be
 268  * called on a bucket that currently has no used iconv descriptors
 269  * open.
 270  *
 271  * @node is not a required argument. If @node is not supplied, we
 272  * search for it ourselves.
 273  **/
 274 static void
 275 iconv_cache_bucket_expire (GList *node, struct _iconv_cache_bucket *bucket)
 276 {
 277   g_hash_table_remove (iconv_cache, bucket->key);
 278
 279   if (node == NULL)
 280     node = g_list_find (iconv_cache_list, bucket);
 281
 282   g_assert (node != NULL);
 283
 284   if (node->prev)
 285     {
 286       node->prev->next = node->next;
 287       if (node->next)
 288         node->next->prev = node->prev;
 289     }
 290   else
 291     {
 292       iconv_cache_list = node->next;
 293       if (node->next)
 294         node->next->prev = NULL;
 295     }
 296
 297   g_list_free_1 (node);
 298
 299   g_free (bucket->key);
 300   g_iconv_close (bucket->cd);
 301   g_free (bucket);
 302
 303   iconv_cache_size--;
 304 }
 305
 306
 307 /**
 308  * iconv_cache_expire_unused:
 309  *
 310  * Expires as many unused cache buckets as it needs to in order to get
 311  * the total number of buckets < ICONV_CACHE_SIZE.
 312  **/
 313 static void
 314 iconv_cache_expire_unused (void)
 315 {
 316   struct _iconv_cache_bucket *bucket;
 317   GList *node, *next;
 318
 319   node = iconv_cache_list;
 320   while (node && iconv_cache_size >= ICONV_CACHE_SIZE)
 321     {
 322       next = node->next;
 323
 324       bucket = node->data;
 325       if (bucket->refcount == 0)
 326         iconv_cache_bucket_expire (node, bucket);
 327
 328       node = next;
 329     }
 330 }
 331
 332 static GIConv
 333 open_converter (const gchar *to_codeset,
 334                 const gchar *from_codeset,
 335                 GError     **error)
 336 {
 337   struct _iconv_cache_bucket *bucket;
 338   gchar *key;
 339   GIConv cd;
 340
 341   /* create our key */
 342   key = g_alloca (strlen (from_codeset) + strlen (to_codeset) + 2);
 343   _g_sprintf (key, "%s:%s", from_codeset, to_codeset);
 344
 345   G_LOCK (iconv_cache_lock);
 346
 347   /* make sure the cache has been initialized */
 348   iconv_cache_init ();
 349
 350   bucket = g_hash_table_lookup (iconv_cache, key);
 351   if (bucket)
 352     {
 353       if (bucket->used)
 354         {
 355           cd = g_iconv_open (to_codeset, from_codeset);
 356           if (cd == (GIConv) -1)
 357             goto error;
 358         }
 359       else
 360         {
 361           /* Apparently iconv on Solaris <= 7 segfaults if you pass in
 362            * NULL for anything but inbuf; work around that. (NULL outbuf
 363            * or NULL *outbuf is allowed by Unix98.)
 364            */
 365           gsize inbytes_left = 0;
 366           gchar *outbuf = NULL;
 367           gsize outbytes_left = 0;
 368
 369           cd = bucket->cd;
 370           bucket->used = TRUE;
 371
 372           /* reset the descriptor */
 373           g_iconv (cd, NULL, &inbytes_left, &outbuf, &outbytes_left);
 374         }
 375
 376       bucket->refcount++;
 377     }
 378   else
 379     {
 380       cd = g_iconv_open (to_codeset, from_codeset);
 381       if (cd == (GIConv) -1)
 382         goto error;
 383
 384       iconv_cache_expire_unused ();
 385
 386       bucket = iconv_cache_bucket_new (key, cd);
 387     }
 388
 389   g_hash_table_insert (iconv_open_hash, cd, bucket->key);
 390
 391   G_UNLOCK (iconv_cache_lock);
 392
 393   return cd;
 394
 395  error:
 396
 397   G_UNLOCK (iconv_cache_lock);
 398
 399   /* Something went wrong.  */
 400   if (error)
 401     {
 402       if (errno == EINVAL)
 403         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_CONVERSION,
 404                      _("Conversion from character set '%s' to '%s' is not supported"),
 405                      from_codeset, to_codeset);
 406       else
 407         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
 408                      _("Could not open converter from '%s' to '%s'"),
 409                      from_codeset, to_codeset);
 410     }
 411
 412   return cd;
 413 }
 414
 415 static int
 416 close_converter (GIConv converter)
 417 {
 418   struct _iconv_cache_bucket *bucket;
 419   const gchar *key;
 420   GIConv cd;
 421
 422   cd = converter;
 423
 424   if (cd == (GIConv) -1)
 425     return 0;
 426
 427   G_LOCK (iconv_cache_lock);
 428
 429   key = g_hash_table_lookup (iconv_open_hash, cd);
 430   if (key)
 431     {
 432       g_hash_table_remove (iconv_open_hash, cd);
 433
 434       bucket = g_hash_table_lookup (iconv_cache, key);
 435       g_assert (bucket);
 436
 437       bucket->refcount--;
 438
 439       if (cd == bucket->cd)
 440         bucket->used = FALSE;
 441       else
 442         g_iconv_close (cd);
 443
 444       if (!bucket->refcount && iconv_cache_size > ICONV_CACHE_SIZE)
 445         {
 446           /* expire this cache bucket */
 447           iconv_cache_bucket_expire (NULL, bucket);
 448         }
 449     }
 450   else
 451     {
 452       G_UNLOCK (iconv_cache_lock);
 453
 454       g_warning ("This iconv context wasn't opened using open_converter");
 455
 456       return g_iconv_close (converter);
 457     }
 458
 459   G_UNLOCK (iconv_cache_lock);
 460
 461   return 0;
 462 }
 463
 464 #else  /* !NEED_ICONV_CACHE */
 465
 466 static GIConv
 467 open_converter (const gchar *to_codeset,
 468                 const gchar *from_codeset,
 469                 GError     **error)
 470 {
 471   GIConv cd;
 472
 473   cd = g_iconv_open (to_codeset, from_codeset);
 474
 475   if (cd == (GIConv) -1)
 476     {
 477       /* Something went wrong.  */
 478       if (error)
 479         {
 480           if (errno == EINVAL)
 481             g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_CONVERSION,
 482                          _("Conversion from character set '%s' to '%s' is not supported"),
 483                          from_codeset, to_codeset);
 484           else
 485             g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
 486                          _("Could not open converter from '%s' to '%s'"),
 487                          from_codeset, to_codeset);
 488         }
 489     }
 490
 491   return cd;
 492 }
 493
 494 static int
 495 close_converter (GIConv cd)
 496 {
 497   if (cd == (GIConv) -1)
 498     return 0;
 499
 500   return g_iconv_close (cd);
 501 }
 502
 503 #endif /* NEED_ICONV_CACHE */
 504
 505 /**
 506  * g_convert_with_iconv:
 507  * @str:           the string to convert
 508  * @len:           the length of the string, or -1 if the string is
 509  *                 nul-terminated<footnoteref linkend="nul-unsafe"/>.
 510  * @converter:     conversion descriptor from g_iconv_open()
 511  * @bytes_read:    location to store the number of bytes in the
 512  *                 input string that were successfully converted, or %NULL.
 513  *                 Even if the conversion was successful, this may be
 514  *                 less than @len if there were partial characters
 515  *                 at the end of the input. If the error
 516  *                 #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
 517  *                 stored will the byte offset after the last valid
 518  *                 input sequence.
 519  * @bytes_written: the number of bytes stored in the output buffer (not
 520  *                 including the terminating nul).
 521  * @error:         location to store the error occuring, or %NULL to ignore
 522  *                 errors. Any of the errors in #GConvertError may occur.
 523  *
 524  * Converts a string from one character set to another.
 525  *
 526  * Note that you should use g_iconv() for streaming
 527  * conversions<footnote id="streaming-state">
 528  *  <para>
 529  * Despite the fact that @byes_read can return information about partial
 530  * characters, the <literal>g_convert_...</literal> functions
 531  * are not generally suitable for streaming. If the underlying converter
 532  * being used maintains internal state, then this won't be preserved
 533  * across successive calls to g_convert(), g_convert_with_iconv() or
 534  * g_convert_with_fallback(). (An example of this is the GNU C converter
 535  * for CP1255 which does not emit a base character until it knows that
 536  * the next character is not a mark that could combine with the base
 537  * character.)
 538  *  </para>
 539  * </footnote>.
 540  *
 541  * Return value: If the conversion was successful, a newly allocated
 542  *               nul-terminated string, which must be freed with
 543  *               g_free(). Otherwise %NULL and @error will be set.
 544  **/
 545 gchar*
 546 g_convert_with_iconv (const gchar *str,
 547                       gssize       len,
 548                       GIConv       converter,
 549                       gsize       *bytes_read,
 550                       gsize       *bytes_written,
 551                       GError     **error)
 552 {
 553   gchar *dest;
 554   gchar *outp;
 555   const gchar *p;
 556   const gchar *shift_p = NULL;
 557   gsize inbytes_remaining;
 558   gsize outbytes_remaining;
 559   gsize err;
 560   gsize outbuf_size;
 561   gboolean have_error = FALSE;
 562   gboolean done = FALSE;
 563
 564   g_return_val_if_fail (converter != (GIConv) -1, NULL);
 565
 566   if (len < 0)
 567     len = strlen (str);
 568
 569   p = str;
 570   inbytes_remaining = len;
 571   outbuf_size = len + 1; /* + 1 for nul in case len == 1 */
 572
 573   outbytes_remaining = outbuf_size - 1; /* -1 for nul */
 574   outp = dest = g_malloc (outbuf_size);
 575
 576   while (!done && !have_error)
 577     {
 578       err = g_iconv (converter, (char **)&p, &inbytes_remaining, &outp, &outbytes_remaining);
 579
 580       if (err == (size_t) -1)
 581         {
 582           switch (errno)
 583             {
 584             case EINVAL:
 585               /* Incomplete text, do not report an error */
 586               done = TRUE;
 587               break;
 588             case E2BIG:
 589               {
 590                 size_t used = outp - dest;
 591
 592                 outbuf_size *= 2;
 593                 dest = g_realloc (dest, outbuf_size);
 594
 595                 outp = dest + used;
 596                 outbytes_remaining = outbuf_size - used - 1; /* -1 for nul */
 597               }
 598               break;
 599             case EILSEQ:
 600               if (error)
 601                 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 602                              _("Invalid byte sequence in conversion input"));
 603               have_error = TRUE;
 604               break;
 605             default:
 606               if (error)
 607                 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
 608                              _("Error during conversion: %s"),
 609                              g_strerror (errno));
 610               have_error = TRUE;
 611               break;
 612             }
 613         }
 614       else
 615         {
 616           if (!shift_p)
 617             {
 618               /* call g_iconv with NULL inbuf to cleanup shift state */
 619               shift_p = p;
 620               p = NULL;
 621               inbytes_remaining = 0;
 622             }
 623           else
 624             done = TRUE;
 625         }
 626     }
 627
 628   if (shift_p)
 629     p = shift_p;
 630
 631   *outp = '\0';
 632
 633   if (bytes_read)
 634     *bytes_read = p - str;
 635   else
 636     {
 637       if ((p - str) != len)
 638         {
 639           if (!have_error)
 640             {
 641               if (error)
 642                 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
 643                              _("Partial character sequence at end of input"));
 644               have_error = TRUE;
 645             }
 646         }
 647     }
 648
 649   if (bytes_written)
 650     *bytes_written = outp - dest;       /* Doesn't include '\0' */
 651
 652   if (have_error)
 653     {
 654       g_free (dest);
 655       return NULL;
 656     }
 657   else
 658     return dest;
 659 }
 660
 661 /**
 662  * g_convert:
 663  * @str:           the string to convert
 664  * @len:           the length of the string, or -1 if the string is
 665  *                 nul-terminated<footnote id="nul-unsafe">
 666                      <para>
 667                        Note that some encodings may allow nul bytes to
 668                        occur inside strings. In that case, using -1 for
 669                        the @len parameter is unsafe.
 670                      </para>
 671                    </footnote>.
 672  * @to_codeset:    name of character set into which to convert @str
 673  * @from_codeset:  character set of @str.
 674  * @bytes_read:    location to store the number of bytes in the
 675  *                 input string that were successfully converted, or %NULL.
 676  *                 Even if the conversion was successful, this may be
 677  *                 less than @len if there were partial characters
 678  *                 at the end of the input. If the error
 679  *                 #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
 680  *                 stored will the byte offset after the last valid
 681  *                 input sequence.
 682  * @bytes_written: the number of bytes stored in the output buffer (not
 683  *                 including the terminating nul).
 684  * @error:         location to store the error occuring, or %NULL to ignore
 685  *                 errors. Any of the errors in #GConvertError may occur.
 686  *
 687  * Converts a string from one character set to another.
 688  *
 689  * Note that you should use g_iconv() for streaming
 690  * conversions<footnoteref linkend="streaming-state"/>.
 691  *
 692  * Return value: If the conversion was successful, a newly allocated
 693  *               nul-terminated string, which must be freed with
 694  *               g_free(). Otherwise %NULL and @error will be set.
 695  **/
 696 gchar*
 697 g_convert (const gchar *str,
 698            gssize       len,
 699            const gchar *to_codeset,
 700            const gchar *from_codeset,
 701            gsize       *bytes_read,
 702            gsize       *bytes_written,
 703            GError     **error)
 704 {
 705   gchar *res;
 706   GIConv cd;
 707
 708   g_return_val_if_fail (str != NULL, NULL);
 709   g_return_val_if_fail (to_codeset != NULL, NULL);
 710   g_return_val_if_fail (from_codeset != NULL, NULL);
 711
 712   cd = open_converter (to_codeset, from_codeset, error);
 713
 714   if (cd == (GIConv) -1)
 715     {
 716       if (bytes_read)
 717         *bytes_read = 0;
 718
 719       if (bytes_written)
 720         *bytes_written = 0;
 721
 722       return NULL;
 723     }
 724
 725   res = g_convert_with_iconv (str, len, cd,
 726                               bytes_read, bytes_written,
 727                               error);
 728
 729   close_converter (cd);
 730
 731   return res;
 732 }
 733
 734 /**
 735  * g_convert_with_fallback:
 736  * @str:          the string to convert
 737  * @len:          the length of the string, or -1 if the string is
 738  *                nul-terminated<footnoteref linkend="nul-unsafe"/>.
 739  * @to_codeset:   name of character set into which to convert @str
 740  * @from_codeset: character set of @str.
 741  * @fallback:     UTF-8 string to use in place of character not
 742  *                present in the target encoding. (The string must be
 743  *                representable in the target encoding).
 744                   If %NULL, characters not in the target encoding will
 745                   be represented as Unicode escapes \uxxxx or \Uxxxxyyyy.
 746  * @bytes_read:   location to store the number of bytes in the
 747  *                input string that were successfully converted, or %NULL.
 748  *                Even if the conversion was successful, this may be
 749  *                less than @len if there were partial characters
 750  *                at the end of the input.
 751  * @bytes_written: the number of bytes stored in the output buffer (not
 752  *                including the terminating nul).
 753  * @error:        location to store the error occuring, or %NULL to ignore
 754  *                errors. Any of the errors in #GConvertError may occur.
 755  *
 756  * Converts a string from one character set to another, possibly
 757  * including fallback sequences for characters not representable
 758  * in the output. Note that it is not guaranteed that the specification
 759  * for the fallback sequences in @fallback will be honored. Some
 760  * systems may do a approximate conversion from @from_codeset
 761  * to @to_codeset in their iconv() functions,
 762  * in which case GLib will simply return that approximate conversion.
 763  *
 764  * Note that you should use g_iconv() for streaming
 765  * conversions<footnoteref linkend="streaming-state"/>.
 766  *
 767  * Return value: If the conversion was successful, a newly allocated
 768  *               nul-terminated string, which must be freed with
 769  *               g_free(). Otherwise %NULL and @error will be set.
 770  **/
 771 gchar*
 772 g_convert_with_fallback (const gchar *str,
 773                          gssize       len,
 774                          const gchar *to_codeset,
 775                          const gchar *from_codeset,
 776                          gchar       *fallback,
 777                          gsize       *bytes_read,
 778                          gsize       *bytes_written,
 779                          GError     **error)
 780 {
 781   gchar *utf8;
 782   gchar *dest;
 783   gchar *outp;
 784   const gchar *insert_str = NULL;
 785   const gchar *p;
 786   gsize inbytes_remaining;
 787   const gchar *save_p = NULL;
 788   gsize save_inbytes = 0;
 789   gsize outbytes_remaining;
 790   gsize err;
 791   GIConv cd;
 792   gsize outbuf_size;
 793   gboolean have_error = FALSE;
 794   gboolean done = FALSE;
 795
 796   GError *local_error = NULL;
 797
 798   g_return_val_if_fail (str != NULL, NULL);
 799   g_return_val_if_fail (to_codeset != NULL, NULL);
 800   g_return_val_if_fail (from_codeset != NULL, NULL);
 801
 802   if (len < 0)
 803     len = strlen (str);
 804
 805   /* Try an exact conversion; we only proceed if this fails
 806    * due to an illegal sequence in the input string.
 807    */
 808   dest = g_convert (str, len, to_codeset, from_codeset,
 809                     bytes_read, bytes_written, &local_error);
 810   if (!local_error)
 811     return dest;
 812
 813   if (!g_error_matches (local_error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
 814     {
 815       g_propagate_error (error, local_error);
 816       return NULL;
 817     }
 818   else
 819     g_error_free (local_error);
 820
 821   local_error = NULL;
 822
 823   /* No go; to proceed, we need a converter from "UTF-8" to
 824    * to_codeset, and the string as UTF-8.
 825    */
 826   cd = open_converter (to_codeset, "UTF-8", error);
 827   if (cd == (GIConv) -1)
 828     {
 829       if (bytes_read)
 830         *bytes_read = 0;
 831
 832       if (bytes_written)
 833         *bytes_written = 0;
 834
 835       return NULL;
 836     }
 837
 838   utf8 = g_convert (str, len, "UTF-8", from_codeset,
 839                     bytes_read, &inbytes_remaining, error);
 840   if (!utf8)
 841     {
 842       close_converter (cd);
 843       if (bytes_written)
 844         *bytes_written = 0;
 845       return NULL;
 846     }
 847
 848   /* Now the heart of the code. We loop through the UTF-8 string, and
 849    * whenever we hit an offending character, we form fallback, convert
 850    * the fallback to the target codeset, and then go back to
 851    * converting the original string after finishing with the fallback.
 852    *
 853    * The variables save_p and save_inbytes store the input state
 854    * for the original string while we are converting the fallback
 855    */
 856   p = utf8;
 857
 858   outbuf_size = len + 1; /* + 1 for nul in case len == 1 */
 859   outbytes_remaining = outbuf_size - 1; /* -1 for nul */
 860   outp = dest = g_malloc (outbuf_size);
 861
 862   while (!done && !have_error)
 863     {
 864       size_t inbytes_tmp = inbytes_remaining;
 865       err = g_iconv (cd, (char **)&p, &inbytes_tmp, &outp, &outbytes_remaining);
 866       inbytes_remaining = inbytes_tmp;
 867
 868       if (err == (size_t) -1)
 869         {
 870           switch (errno)
 871             {
 872             case EINVAL:
 873               g_assert_not_reached();
 874               break;
 875             case E2BIG:
 876               {
 877                 size_t used = outp - dest;
 878
 879                 outbuf_size *= 2;
 880                 dest = g_realloc (dest, outbuf_size);
 881
 882                 outp = dest + used;
 883                 outbytes_remaining = outbuf_size - used - 1; /* -1 for nul */
 884
 885                 break;
 886               }
 887             case EILSEQ:
 888               if (save_p)
 889                 {
 890                   /* Error converting fallback string - fatal
 891                    */
 892                   g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 893                                _("Cannot convert fallback '%s' to codeset '%s'"),
 894                                insert_str, to_codeset);
 895                   have_error = TRUE;
 896                   break;
 897                 }
 898               else if (p)
 899                 {
 900                   if (!fallback)
 901                     {
 902                       gunichar ch = g_utf8_get_char (p);
 903                       insert_str = g_strdup_printf (ch < 0x10000 ? "\\u%04x" : "\\U%08x",
 904                                                     ch);
 905                     }
 906                   else
 907                     insert_str = fallback;
 908
 909                   save_p = g_utf8_next_char (p);
 910                   save_inbytes = inbytes_remaining - (save_p - p);
 911                   p = insert_str;
 912                   inbytes_remaining = strlen (p);
 913                   break;
 914                 }
 915               /* fall thru if p is NULL */
 916             default:
 917               g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
 918                            _("Error during conversion: %s"),
 919                            g_strerror (errno));
 920               have_error = TRUE;
 921               break;
 922             }
 923         }
 924       else
 925         {
 926           if (save_p)
 927             {
 928               if (!fallback)
 929                 g_free ((gchar *)insert_str);
 930               p = save_p;
 931               inbytes_remaining = save_inbytes;
 932               save_p = NULL;
 933             }
 934           else if (p)
 935             {
 936               /* call g_iconv with NULL inbuf to cleanup shift state */
 937               p = NULL;
 938               inbytes_remaining = 0;
 939             }
 940           else
 941             done = TRUE;
 942         }
 943     }
 944
 945   /* Cleanup
 946    */
 947   *outp = '\0';
 948
 949   close_converter (cd);
 950
 951   if (bytes_written)
 952     *bytes_written = outp - dest;       /* Doesn't include '\0' */
 953
 954   g_free (utf8);
 955
 956   if (have_error)
 957     {
 958       if (save_p && !fallback)
 959         g_free ((gchar *)insert_str);
 960       g_free (dest);
 961       return NULL;
 962     }
 963   else
 964     return dest;
 965 }
 966
 967 /*
 968  * g_locale_to_utf8
 969  *
 970  *
 971  */
 972
 973 static gchar *
 974 strdup_len (const gchar *string,
 975             gssize       len,
 976             gsize       *bytes_written,
 977             gsize       *bytes_read,
 978             GError      **error)
 979
 980 {
 981   gsize real_len;
 982
 983   if (!g_utf8_validate (string, len, NULL))
 984     {
 985       if (bytes_read)
 986         *bytes_read = 0;
 987       if (bytes_written)
 988         *bytes_written = 0;
 989
 990       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 991                    _("Invalid byte sequence in conversion input"));
 992       return NULL;
 993     }
 994
 995   if (len < 0)
 996     real_len = strlen (string);
 997   else
 998     {
 999       real_len = 0;
1000
1001       while (real_len < len && string[real_len])
1002         real_len++;
1003     }
1004
1005   if (bytes_read)
1006     *bytes_read = real_len;
1007   if (bytes_written)
1008     *bytes_written = real_len;
1009
1010   return g_strndup (string, real_len);
1011 }
1012
1013 /**
1014  * g_locale_to_utf8:
1015  * @opsysstring:   a string in the encoding of the current locale. On Windows
1016  *                 this means the system codepage.
1017  * @len:           the length of the string, or -1 if the string is
1018  *                 nul-terminated<footnoteref linkend="nul-unsafe"/>.
1019  * @bytes_read:    location to store the number of bytes in the
1020  *                 input string that were successfully converted, or %NULL.
1021  *                 Even if the conversion was successful, this may be
1022  *                 less than @len if there were partial characters
1023  *                 at the end of the input. If the error
1024  *                 #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
1025  *                 stored will the byte offset after the last valid
1026  *                 input sequence.
1027  * @bytes_written: the number of bytes stored in the output buffer (not
1028  *                 including the terminating nul).
1029  * @error:         location to store the error occuring, or %NULL to ignore
1030  *                 errors. Any of the errors in #GConvertError may occur.
1031  *
1032  * Converts a string which is in the encoding used for strings by
1033  * the C runtime (usually the same as that used by the operating
1034  * system) in the current locale into a UTF-8 string.
1035  *
1036  * Return value: The converted string, or %NULL on an error.
1037  **/
1038 gchar *
1039 g_locale_to_utf8 (const gchar  *opsysstring,
1040                   gssize        len,
1041                   gsize        *bytes_read,
1042                   gsize        *bytes_written,
1043                   GError      **error)
1044 {
1045   const char *charset;
1046
1047   if (g_get_charset (&charset))
1048     return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
1049   else
1050     return g_convert (opsysstring, len,
1051                       "UTF-8", charset, bytes_read, bytes_written, error);
1052 }
1053
1054 /**
1055  * g_locale_from_utf8:
1056  * @utf8string:    a UTF-8 encoded string
1057  * @len:           the length of the string, or -1 if the string is
1058  *                 nul-terminated<footnoteref linkend="nul-unsafe"/>.
1059  * @bytes_read:    location to store the number of bytes in the
1060  *                 input string that were successfully converted, or %NULL.
1061  *                 Even if the conversion was successful, this may be
1062  *                 less than @len if there were partial characters
1063  *                 at the end of the input. If the error
1064  *                 #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
1065  *                 stored will the byte offset after the last valid
1066  *                 input sequence.
1067  * @bytes_written: the number of bytes stored in the output buffer (not
1068  *                 including the terminating nul).
1069  * @error:         location to store the error occuring, or %NULL to ignore
1070  *                 errors. Any of the errors in #GConvertError may occur.
1071  *
1072  * Converts a string from UTF-8 to the encoding used for strings by
1073  * the C runtime (usually the same as that used by the operating
1074  * system) in the current locale.
1075  *
1076  * Return value: The converted string, or %NULL on an error.
1077  **/
1078 gchar *
1079 g_locale_from_utf8 (const gchar *utf8string,
1080                     gssize       len,
1081                     gsize       *bytes_read,
1082                     gsize       *bytes_written,
1083                     GError     **error)
1084 {
1085   const gchar *charset;
1086
1087   if (g_get_charset (&charset))
1088     return strdup_len (utf8string, len, bytes_read, bytes_written, error);
1089   else
1090     return g_convert (utf8string, len,
1091                       charset, "UTF-8", bytes_read, bytes_written, error);
1092 }
1093
1094 #ifndef G_PLATFORM_WIN32
1095
/* Cached result of the filename-charset lookup done by
 * g_get_filename_charsets(); released by filename_charset_cache_free().
 */
typedef struct _GFilenameCharsetCache GFilenameCharsetCache;

struct _GFilenameCharsetCache {
  gboolean is_utf8;           /* value returned by g_get_filename_charsets() */
  gchar *charset;             /* copy of the locale charset the cache was built for */
  gchar **filename_charsets;  /* NULL-terminated vector, freed with g_strfreev() */
};
1103
1104 static void
1105 filename_charset_cache_free (gpointer data)
1106 {
1107   GFilenameCharsetCache *cache = data;
1108   g_free (cache->charset);
1109   g_strfreev (cache->filename_charsets);
1110   g_free (cache);
1111 }
1112
1113 /**
1114  * g_get_filename_charsets:
1115  * @charsets: return location for the %NULL-terminated list of encoding names
1116  *
1117  * Determines the preferred character sets used for filenames.
1118  * The first character set from the @charsets is the filename encoding, the
1119  * subsequent character sets are used when trying to generate a displayable
1120  * representation of a filename, see g_filename_display_name().
1121  *
1122  * On Unix, the character sets are determined by consulting the
1123  * environment variables <envar>G_FILENAME_ENCODING</envar> and
1124  * <envar>G_BROKEN_FILENAMES</envar>. On Windows, the character set
1125  * used in the GLib API is always UTF-8 and said environment variables
1126  * have no effect.
1127  *
1128  * <envar>G_FILENAME_ENCODING</envar> may be set to a comma-separated list
1129  * of character set names. The special token "@locale" is taken to mean the
1130  * character set for the current locale. If <envar>G_FILENAME_ENCODING</envar>
1131  * is not set, but <envar>G_BROKEN_FILENAMES</envar> is, the character set of
1132  * the current locale is taken as the filename encoding. If neither environment
1133  * variable is set, UTF-8 is taken as the filename encoding, but the character
1134  * set of the current locale is also put in the list of encodings.
1135  *
1136  * The returned @charsets belong to GLib and must not be freed.
1137  *
1138  * Note that on Unix, regardless of the locale character set or
1139  * <envar>G_FILENAME_ENCODING</envar> value, the actual file names present on a
1140  * system might be in any random encoding or just gibberish.
1141  *
1142  * Return value: %TRUE if the filename encoding is UTF-8.
1143  *
1144  * Since: 2.6
1145  */
1146 gboolean
1147 g_get_filename_charsets (G_CONST_RETURN gchar ***filename_charsets)
1148 {
1149   static GStaticPrivate cache_private = G_STATIC_PRIVATE_INIT;
1150   GFilenameCharsetCache *cache = g_static_private_get (&cache_private);
1151   const gchar *charset;
1152
1153   if (!cache)
1154     {
1155       cache = g_new0 (GFilenameCharsetCache, 1);
1156       g_static_private_set (&cache_private, cache, filename_charset_cache_free);
1157     }
1158
1159   g_get_charset (&charset);
1160
1161   if (!(cache->charset && strcmp (cache->charset, charset) == 0))
1162     {
1163       const gchar *new_charset;
1164       gchar *p;
1165       gint i;
1166
1167       g_free (cache->charset);
1168       g_strfreev (cache->filename_charsets);
1169       cache->charset = g_strdup (charset);
1170
1171       p = getenv ("G_FILENAME_ENCODING");
1172       if (p != NULL && p[0] != '\0')
1173         {
1174           cache->filename_charsets = g_strsplit (p, ",", 0);
1175           cache->is_utf8 = (strcmp (cache->filename_charsets[0], "UTF-8") == 0);
1176
1177           for (i = 0; cache->filename_charsets[i]; i++)
1178             {
1179               if (strcmp ("@locale", cache->filename_charsets[i]) == 0)
1180                 {
1181                   g_get_charset (&new_charset);
1182                   g_free (cache->filename_charsets[i]);
1183                   cache->filename_charsets[i] = g_strdup (new_charset);
1184                 }
1185             }
1186         }
1187       else if (getenv ("G_BROKEN_FILENAMES") != NULL)
1188         {
1189           cache->filename_charsets = g_new0 (gchar *, 2);
1190           cache->is_utf8 = g_get_charset (&new_charset);
1191           cache->filename_charsets[0] = g_strdup (new_charset);
1192         }
1193       else
1194         {
1195           cache->filename_charsets = g_new0 (gchar *, 3);
1196           cache->is_utf8 = TRUE;
1197           cache->filename_charsets[0] = g_strdup ("UTF-8");
1198           if (!g_get_charset (&new_charset))
1199             cache->filename_charsets[1] = g_strdup (new_charset);
1200         }
1201     }
1202
1203   if (filename_charsets)
1204     *filename_charsets = (const gchar **)cache->filename_charsets;
1205
1206   return cache->is_utf8;
1207 }
1208
1209 #else /* G_PLATFORM_WIN32 */
1210
1211 gboolean
1212 g_get_filename_charsets (G_CONST_RETURN gchar ***filename_charsets)
1213 {
1214   static const gchar *charsets[] = {
1215     "UTF-8",
1216     NULL
1217   };
1218
1219 #ifdef G_OS_WIN32
1220   /* On Windows GLib pretends that the filename charset is UTF-8 */
1221   if (filename_charsets)
1222     *filename_charsets = charsets;
1223
1224   return TRUE;
1225 #else
1226   gboolean result;
1227
1228   /* Cygwin works like before */
1229   result = g_get_charset (&(charsets[0]));
1230
1231   if (filename_charsets)
1232     *filename_charsets = charsets;
1233
1234   return result;
1235 #endif
1236 }
1237
1238 #endif /* G_PLATFORM_WIN32 */
1239
1240 static gboolean
1241 get_filename_charset (const gchar **filename_charset)
1242 {
1243   const gchar **charsets;
1244   gboolean is_utf8;
1245
1246   is_utf8 = g_get_filename_charsets (&charsets);
1247
1248   if (filename_charset)
1249     *filename_charset = charsets[0];
1250
1251   return is_utf8;
1252 }
1253
1254 /* This is called from g_thread_init(). It's used to
1255  * initialize some static data in a threadsafe way.
1256  */
1257 void
1258 _g_convert_thread_init (void)
1259 {
1260   const gchar **dummy;
1261   (void) g_get_filename_charsets (&dummy);
1262 }
1263
1264 /**
1265  * g_filename_to_utf8:
1266  * @opsysstring:   a string in the encoding for filenames
1267  * @len:           the length of the string, or -1 if the string is
1268  *                 nul-terminated<footnoteref linkend="nul-unsafe"/>.
1269  * @bytes_read:    location to store the number of bytes in the
1270  *                 input string that were successfully converted, or %NULL.
1271  *                 Even if the conversion was successful, this may be
1272  *                 less than @len if there were partial characters
1273  *                 at the end of the input. If the error
1274  *                 #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
1275  *                 stored will the byte offset after the last valid
1276  *                 input sequence.
1277  * @bytes_written: the number of bytes stored in the output buffer (not
1278  *                 including the terminating nul).
1279  * @error:         location to store the error occuring, or %NULL to ignore
1280  *                 errors. Any of the errors in #GConvertError may occur.
1281  *
1282  * Converts a string which is in the encoding used by GLib for
1283  * filenames into a UTF-8 string. Note that on Windows GLib uses UTF-8
1284  * for filenames.
1285  *
1286  * Return value: The converted string, or %NULL on an error.
1287  **/
1288 gchar*
1289 g_filename_to_utf8 (const gchar *opsysstring,
1290                     gssize       len,
1291                     gsize       *bytes_read,
1292                     gsize       *bytes_written,
1293                     GError     **error)
1294 {
1295   const gchar *charset;
1296
1297   if (get_filename_charset (&charset))
1298     return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
1299   else
1300     return g_convert (opsysstring, len,
1301                       "UTF-8", charset, bytes_read, bytes_written, error);
1302 }
1303
1304 #ifdef G_OS_WIN32
1305
1306 #undef g_filename_to_utf8
1307
1308 /* Binary compatibility version. Not for newly compiled code. */
1309
1310 gchar*
1311 g_filename_to_utf8 (const gchar *opsysstring,
1312                     gssize       len,
1313                     gsize       *bytes_read,
1314                     gsize       *bytes_written,
1315                     GError     **error)
1316 {
1317   const gchar *charset;
1318
1319   if (g_get_charset (&charset))
1320     return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
1321   else
1322     return g_convert (opsysstring, len,
1323                       "UTF-8", charset, bytes_read, bytes_written, error);
1324 }
1325
1326 #endif
1327
1328 /**
1329  * g_filename_from_utf8:
1330  * @utf8string:    a UTF-8 encoded string.
1331  * @len:           the length of the string, or -1 if the string is
1332  *                 nul-terminated.
1333  * @bytes_read:    location to store the number of bytes in the
1334  *                 input string that were successfully converted, or %NULL.
1335  *                 Even if the conversion was successful, this may be
1336  *                 less than @len if there were partial characters
1337  *                 at the end of the input. If the error
1338  *                 #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
1339  *                 stored will the byte offset after the last valid
1340  *                 input sequence.
1341  * @bytes_written: the number of bytes stored in the output buffer (not
1342  *                 including the terminating nul).
1343  * @error:         location to store the error occuring, or %NULL to ignore
1344  *                 errors. Any of the errors in #GConvertError may occur.
1345  *
1346  * Converts a string from UTF-8 to the encoding GLib uses for
1347  * filenames. Note that on Windows GLib uses UTF-8 for filenames.
1348  *
1349  * Return value: The converted string, or %NULL on an error.
1350  **/
1351 gchar*
1352 g_filename_from_utf8 (const gchar *utf8string,
1353                       gssize       len,
1354                       gsize       *bytes_read,
1355                       gsize       *bytes_written,
1356                       GError     **error)
1357 {
1358   const gchar *charset;
1359
1360   if (get_filename_charset (&charset))
1361     return strdup_len (utf8string, len, bytes_read, bytes_written, error);
1362   else
1363     return g_convert (utf8string, len,
1364                       charset, "UTF-8", bytes_read, bytes_written, error);
1365 }
1366
1367 #ifdef G_OS_WIN32
1368
1369 #undef g_filename_from_utf8
1370
1371 /* Binary compatibility version. Not for newly compiled code. */
1372
1373 gchar*
1374 g_filename_from_utf8 (const gchar *utf8string,
1375                       gssize       len,
1376                       gsize       *bytes_read,
1377                       gsize       *bytes_written,
1378                       GError     **error)
1379 {
1380   const gchar *charset;
1381
1382   if (g_get_charset (&charset))
1383     return strdup_len (utf8string, len, bytes_read, bytes_written, error);
1384   else
1385     return g_convert (utf8string, len,
1386                       charset, "UTF-8", bytes_read, bytes_written, error);
1387 }
1388
1389 #endif
1390
1391 /* Test of haystack has the needle prefix, comparing case
1392  * insensitive. haystack may be UTF-8, but needle must
1393  * contain only ascii. */
1394 static gboolean
1395 has_case_prefix (const gchar *haystack, const gchar *needle)
1396 {
1397   const gchar *h, *n;
1398
1399   /* Eat one character at a time. */
1400   h = haystack;
1401   n = needle;
1402
1403   while (*n && *h &&
1404          g_ascii_tolower (*n) == g_ascii_tolower (*h))
1405     {
1406       n++;
1407       h++;
1408     }
1409
1410   return *n == '\0';
1411 }
1412
/* Escaping policies for g_escape_uri_string(). Each value is a single bit
 * that is tested against the per-character entries of the acceptable[]
 * table: a character is left unescaped only if its entry has the bit of
 * the chosen policy set. Note that no value uses bit 0x4.
 */
typedef enum {
  UNSAFE_ALL        = 0x1,  /* Escape all unsafe characters   */
  UNSAFE_ALLOW_PLUS = 0x2,  /* Allows '+'  */
  UNSAFE_PATH       = 0x8,  /* Allows '/', '&', '=', ':', '@', '+', '$' and ',' */
  UNSAFE_HOST       = 0x10, /* Allows '/' and ':' and '@' */
  UNSAFE_SLASHES    = 0x20  /* Allows all characters except for '/' and '%' */
} UnsafeCharacterSet;
1420
/* Per-character escaping table, indexed by (character - 32). Each entry is
 * a bit set of the UnsafeCharacterSet policies under which the character may
 * appear unescaped; 0x3F means "acceptable under every policy" and 0x00
 * (space and '%') means "always escaped". Characters outside 32..127 are
 * never looked up here; the ACCEPTABLE() macro rejects them first.
 */
static const guchar acceptable[96] = {
  /* A table of the ASCII chars from space (32) to DEL (127) */
  /*      !    "    #    $    %    &    '    (    )    *    +    ,    -    .    / */
  0x00,0x3F,0x20,0x20,0x28,0x00,0x2C,0x3F,0x3F,0x3F,0x3F,0x2A,0x28,0x3F,0x3F,0x1C,
  /* 0    1    2    3    4    5    6    7    8    9    :    ;    <    =    >    ? */
  0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x38,0x20,0x20,0x2C,0x20,0x20,
  /* @    A    B    C    D    E    F    G    H    I    J    K    L    M    N    O */
  0x38,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,
  /* P    Q    R    S    T    U    V    W    X    Y    Z    [    \    ]    ^    _ */
  0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x20,0x20,0x20,0x20,0x3F,
  /* `    a    b    c    d    e    f    g    h    i    j    k    l    m    n    o */
  0x20,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,
  /* p    q    r    s    t    u    v    w    x    y    z    {    |    }    ~  DEL */
  0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x20,0x20,0x20,0x3F,0x20
};
1436
/* Upper-case hexadecimal digits indexed by nibble value, used when writing
 * %XX escapes. The array holds exactly the 16 digits and no terminating
 * nul, so it must only be indexed, never used as a C string.
 */
static const gchar hex[16] = "0123456789ABCDEF";
1438
1439 /* Note: This escape function works on file: URIs, but if you want to
1440  * escape something else, please read RFC-2396 */
1441 static gchar *
1442 g_escape_uri_string (const gchar *string,
1443                      UnsafeCharacterSet mask)
1444 {
1445 #define ACCEPTABLE(a) ((a)>=32 && (a)<128 && (acceptable[(a)-32] & use_mask))
1446
1447   const gchar *p;
1448   gchar *q;
1449   gchar *result;
1450   int c;
1451   gint unacceptable;
1452   UnsafeCharacterSet use_mask;
1453
1454   g_return_val_if_fail (mask == UNSAFE_ALL
1455                         || mask == UNSAFE_ALLOW_PLUS
1456                         || mask == UNSAFE_PATH
1457                         || mask == UNSAFE_HOST
1458                         || mask == UNSAFE_SLASHES, NULL);
1459
1460   unacceptable = 0;
1461   use_mask = mask;
1462   for (p = string; *p != '\0'; p++)
1463     {
1464       c = (guchar) *p;
1465       if (!ACCEPTABLE (c))
1466         unacceptable++;
1467     }
1468
1469   result = g_malloc (p - string + unacceptable * 2 + 1);
1470
1471   use_mask = mask;
1472   for (q = result, p = string; *p != '\0'; p++)
1473     {
1474       c = (guchar) *p;
1475
1476       if (!ACCEPTABLE (c))
1477         {
1478           *q++ = '%'; /* means hex coming */
1479           *q++ = hex[c >> 4];
1480           *q++ = hex[c & 15];
1481         }
1482       else
1483         *q++ = *p;
1484     }
1485
1486   *q = '\0';
1487
1488   return result;
1489 }
1490
1491
1492 static gchar *
1493 g_escape_file_uri (const gchar *hostname,
1494                    const gchar *pathname)
1495 {
1496   char *escaped_hostname = NULL;
1497   char *escaped_path;
1498   char *res;
1499
1500 #ifdef G_OS_WIN32
1501   char *p, *backslash;
1502
1503   /* Turn backslashes into forward slashes. That's what Netscape
1504    * does, and they are actually more or less equivalent in Windows.
1505    */
1506
1507   pathname = g_strdup (pathname);
1508   p = (char *) pathname;
1509
1510   while ((backslash = strchr (p, '\\')) != NULL)
1511     {
1512       *backslash = '/';
1513       p = backslash + 1;
1514     }
1515 #endif
1516
1517   if (hostname && *hostname != '\0')
1518     {
1519       escaped_hostname = g_escape_uri_string (hostname, UNSAFE_HOST);
1520     }
1521
1522   escaped_path = g_escape_uri_string (pathname, UNSAFE_PATH);
1523
1524   res = g_strconcat ("file://",
1525                      (escaped_hostname) ? escaped_hostname : "",
1526                      (*escaped_path != '/') ? "/" : "",
1527                      escaped_path,
1528                      NULL);
1529
1530 #ifdef G_OS_WIN32
1531   g_free ((char *) pathname);
1532 #endif
1533
1534   g_free (escaped_hostname);
1535   g_free (escaped_path);
1536
1537   return res;
1538 }
1539
/* Decodes the two hexadecimal digits at scanner[0] and scanner[1] into a
 * byte value (0..255). Returns -1 if either character is not a hex digit;
 * scanner[1] is not examined when scanner[0] is already invalid.
 */
static int
unescape_character (const char *scanner)
{
  int high_nibble;
  int low_nibble;

  high_nibble = g_ascii_xdigit_value (scanner[0]);
  if (high_nibble < 0)
    return -1;

  low_nibble = g_ascii_xdigit_value (scanner[1]);
  if (low_nibble < 0)
    return -1;

  return (high_nibble << 4) | low_nibble;
}
1556
1557 static gchar *
1558 g_unescape_uri_string (const char *escaped,
1559                        int         len,
1560                        const char *illegal_escaped_characters,
1561                        gboolean    ascii_must_not_be_escaped)
1562 {
1563   const gchar *in, *in_end;
1564   gchar *out, *result;
1565   int c;
1566
1567   if (escaped == NULL)
1568     return NULL;
1569
1570   if (len < 0)
1571     len = strlen (escaped);
1572
1573   result = g_malloc (len + 1);
1574
1575   out = result;
1576   for (in = escaped, in_end = escaped + len; in < in_end; in++)
1577     {
1578       c = *in;
1579
1580       if (c == '%')
1581         {
1582           /* catch partial escape sequences past the end of the substring */
1583           if (in + 3 > in_end)
1584             break;
1585
1586           c = unescape_character (in + 1);
1587
1588           /* catch bad escape sequences and NUL characters */
1589           if (c <= 0)
1590             break;
1591
1592           /* catch escaped ASCII */
1593           if (ascii_must_not_be_escaped && c <= 0x7F)
1594             break;
1595
1596           /* catch other illegal escaped characters */
1597           if (strchr (illegal_escaped_characters, c) != NULL)
1598             break;
1599
1600           in += 2;
1601         }
1602
1603       *out++ = c;
1604     }
1605
1606   g_assert (out - result <= len);
1607   *out = '\0';
1608
1609   if (in != in_end)
1610     {
1611       g_free (result);
1612       return NULL;
1613     }
1614
1615   return result;
1616 }
1617
1618 static gboolean
1619 is_asciialphanum (gunichar c)
1620 {
1621   return c <= 0x7F && g_ascii_isalnum (c);
1622 }
1623
1624 static gboolean
1625 is_asciialpha (gunichar c)
1626 {
1627   return c <= 0x7F && g_ascii_isalpha (c);
1628 }
1629
1630 /* allows an empty string */
1631 static gboolean
1632 hostname_validate (const char *hostname)
1633 {
1634   const char *p;
1635   gunichar c, first_char, last_char;
1636
1637   p = hostname;
1638   if (*p == '\0')
1639     return TRUE;
1640   do
1641     {
1642       /* read in a label */
1643       c = g_utf8_get_char (p);
1644       p = g_utf8_next_char (p);
1645       if (!is_asciialphanum (c))
1646         return FALSE;
1647       first_char = c;
1648       do
1649         {
1650           last_char = c;
1651           c = g_utf8_get_char (p);
1652           p = g_utf8_next_char (p);
1653         }
1654       while (is_asciialphanum (c) || c == '-');
1655       if (last_char == '-')
1656         return FALSE;
1657
1658       /* if that was the last label, check that it was a toplabel */
1659       if (c == '\0' || (c == '.' && *p == '\0'))
1660         return is_asciialpha (first_char);
1661     }
1662   while (c == '.');
1663   return FALSE;
1664 }
1665
1666 /**
1667  * g_filename_from_uri:
1668  * @uri: a uri describing a filename (escaped, encoded in ASCII).
1669  * @hostname: Location to store hostname for the URI, or %NULL.
1670  *            If there is no hostname in the URI, %NULL will be
1671  *            stored in this location.
1672  * @error: location to store the error occuring, or %NULL to ignore
1673  *         errors. Any of the errors in #GConvertError may occur.
1674  *
1675  * Converts an escaped ASCII-encoded URI to a local filename in the
1676  * encoding used for filenames.
1677  *
1678  * Return value: a newly-allocated string holding the resulting
1679  *               filename, or %NULL on an error.
1680  **/
1681 gchar *
1682 g_filename_from_uri (const gchar *uri,
1683                      gchar      **hostname,
1684                      GError     **error)
1685 {
1686   const char *path_part;
1687   const char *host_part;
1688   char *unescaped_hostname;
1689   char *result;
1690   char *filename;
1691   int offs;
1692 #ifdef G_OS_WIN32
1693   char *p, *slash;
1694 #endif
1695
1696   if (hostname)
1697     *hostname = NULL;
1698
1699   if (!has_case_prefix (uri, "file:/"))
1700     {
1701       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1702                    _("The URI '%s' is not an absolute URI using the \"file\" scheme"),
1703                    uri);
1704       return NULL;
1705     }
1706
1707   path_part = uri + strlen ("file:");
1708
1709   if (strchr (path_part, '#') != NULL)
1710     {
1711       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1712                    _("The local file URI '%s' may not include a '#'"),
1713                    uri);
1714       return NULL;
1715     }
1716
1717   if (has_case_prefix (path_part, "///"))
1718     path_part += 2;
1719   else if (has_case_prefix (path_part, "//"))
1720     {
1721       path_part += 2;
1722       host_part = path_part;
1723
1724       path_part = strchr (path_part, '/');
1725
1726       if (path_part == NULL)
1727         {
1728           g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1729                        _("The URI '%s' is invalid"),
1730                        uri);
1731           return NULL;
1732         }
1733
1734       unescaped_hostname = g_unescape_uri_string (host_part, path_part - host_part, "", TRUE);
1735
1736       if (unescaped_hostname == NULL ||
1737           !hostname_validate (unescaped_hostname))
1738         {
1739           g_free (unescaped_hostname);
1740           g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1741                        _("The hostname of the URI '%s' is invalid"),
1742                        uri);
1743           return NULL;
1744         }
1745
1746       if (hostname)
1747         *hostname = unescaped_hostname;
1748       else
1749         g_free (unescaped_hostname);
1750     }
1751
1752   filename = g_unescape_uri_string (path_part, -1, "/", FALSE);
1753
1754   if (filename == NULL)
1755     {
1756       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1757                    _("The URI '%s' contains invalidly escaped characters"),
1758                    uri);
1759       return NULL;
1760     }
1761
1762   offs = 0;
1763 #ifdef G_OS_WIN32
1764   /* Drop localhost */
1765   if (hostname && *hostname != NULL &&
1766       g_ascii_strcasecmp (*hostname, "localhost") == 0)
1767     {
1768       g_free (*hostname);
1769       *hostname = NULL;
1770     }
1771
1772   /* Turn slashes into backslashes, because that's the canonical spelling */
1773   p = filename;
1774   while ((slash = strchr (p, '/')) != NULL)
1775     {
1776       *slash = '\\';
1777       p = slash + 1;
1778     }
1779
1780   /* Windows URIs with a drive letter can be like "file://host/c:/foo"
1781    * or "file://host/c|/foo" (some Netscape versions). In those cases, start
1782    * the filename from the drive letter.
1783    */
1784   if (g_ascii_isalpha (filename[1]))
1785     {
1786       if (filename[2] == ':')
1787         offs = 1;
1788       else if (filename[2] == '|')
1789         {
1790           filename[2] = ':';
1791           offs = 1;
1792         }
1793     }
1794 #endif
1795
1796   result = g_strdup (filename + offs);
1797   g_free (filename);
1798
1799   return result;
1800 }
1801
1802 #ifdef G_OS_WIN32
1803
1804 #undef g_filename_from_uri
1805
1806 gchar *
1807 g_filename_from_uri (const gchar *uri,
1808                      gchar      **hostname,
1809                      GError     **error)
1810 {
1811   gchar *utf8_filename;
1812   gchar *retval = NULL;
1813
1814   utf8_filename = g_filename_from_uri_utf8 (uri, hostname, error);
1815   if (utf8_filename)
1816     {
1817       retval = g_locale_from_utf8 (utf8_filename, -1, NULL, NULL, error);
1818       g_free (utf8_filename);
1819     }
1820   return retval;
1821 }
1822
1823 #endif
1824
1825 /**
1826  * g_filename_to_uri:
1827  * @filename: an absolute filename specified in the GLib file name encoding,
1828  *            which is the on-disk file name bytes on Unix, and UTF-8 on
1829  *            Windows
1830  * @hostname: A UTF-8 encoded hostname, or %NULL for none.
1831  * @error: location to store the error occuring, or %NULL to ignore
1832  *         errors. Any of the errors in #GConvertError may occur.
1833  *
1834  * Converts an absolute filename to an escaped ASCII-encoded URI, with the path
1835  * component following Section 3.3. of RFC 2396.
1836  *
1837  * Return value: a newly-allocated string holding the resulting
1838  *               URI, or %NULL on an error.
1839  **/
1840 gchar *
1841 g_filename_to_uri (const gchar *filename,
1842                    const gchar *hostname,
1843                    GError     **error)
1844 {
1845   char *escaped_uri;
1846
1847   g_return_val_if_fail (filename != NULL, NULL);
1848
1849   if (!g_path_is_absolute (filename))
1850     {
1851       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NOT_ABSOLUTE_PATH,
1852                    _("The pathname '%s' is not an absolute path"),
1853                    filename);
1854       return NULL;
1855     }
1856
1857   if (hostname &&
1858       !(g_utf8_validate (hostname, -1, NULL)
1859         && hostname_validate (hostname)))
1860     {
1861       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1862                    _("Invalid hostname"));
1863       return NULL;
1864     }
1865
1866 #ifdef G_OS_WIN32
1867   /* Don't use localhost unnecessarily */
1868   if (hostname && g_ascii_strcasecmp (hostname, "localhost") == 0)
1869     hostname = NULL;
1870 #endif
1871
1872   escaped_uri = g_escape_file_uri (hostname, filename);
1873
1874   return escaped_uri;
1875 }
1876
1877 #ifdef G_OS_WIN32
1878
1879 #undef g_filename_to_uri
1880
1881 gchar *
1882 g_filename_to_uri (const gchar *filename,
1883                    const gchar *hostname,
1884                    GError     **error)
1885 {
1886   gchar *utf8_filename;
1887   gchar *retval = NULL;
1888
1889   utf8_filename = g_locale_to_utf8 (filename, -1, NULL, NULL, error);
1890
1891   if (utf8_filename)
1892     {
1893       retval = g_filename_to_uri_utf8 (utf8_filename, hostname, error);
1894       g_free (utf8_filename);
1895     }
1896
1897   return retval;
1898 }
1899
1900 #endif
1901
1902 /**
1903  * g_uri_list_extract_uris:
1904  * @uri_list: an URI list
1905  *
1906  * Splits an URI list conforming to the text/uri-list
1907  * mime type defined in RFC 2483 into individual URIs,
1908  * discarding any comments. The URIs are not validated.
1909  *
1910  * Returns: a newly allocated %NULL-terminated list of
1911  *   strings holding the individual URIs. The array should
1912  *   be freed with g_strfreev().
1913  *
1914  * Since: 2.6
1915  */
1916 gchar **
1917 g_uri_list_extract_uris (const gchar *uri_list)
1918 {
1919   GSList *uris, *u;
1920   const gchar *p, *q;
1921   gchar **result;
1922   gint n_uris = 0;
1923
1924   uris = NULL;
1925
1926   p = uri_list;
1927
1928   /* We don't actually try to validate the URI according to RFC
1929    * 2396, or even check for allowed characters - we just ignore
1930    * comments and trim whitespace off the ends.  We also
1931    * allow LF delimination as well as the specified CRLF.
1932    *
1933    * We do allow comments like specified in RFC 2483.
1934    */
1935   while (p)
1936     {
1937       if (*p != '#')
1938         {
1939           while (g_ascii_isspace (*p))
1940             p++;
1941
1942           q = p;
1943           while (*q && (*q != '\n') && (*q != '\r'))
1944             q++;
1945
1946           if (q > p)
1947             {
1948               q--;
1949               while (q > p && g_ascii_isspace (*q))
1950                 q--;
1951
1952               if (q > p)
1953                 {
1954                   uris = g_slist_prepend (uris, g_strndup (p, q - p + 1));
1955                   n_uris++;
1956                 }
1957             }
1958         }
1959       p = strchr (p, '\n');
1960       if (p)
1961         p++;
1962     }
1963
1964   result = g_new (gchar *, n_uris + 1);
1965
1966   result[n_uris--] = NULL;
1967   for (u = uris; u; u = u->next)
1968     result[n_uris--] = u->data;
1969
1970   g_slist_free (uris);
1971
1972   return result;
1973 }
1974
1975 static gchar *
1976 make_valid_utf8 (const gchar *name)
1977 {
1978   GString *string;
1979   const gchar *remainder, *invalid;
1980   gint remaining_bytes, valid_bytes;
1981
1982   string = NULL;
1983   remainder = name;
1984   remaining_bytes = strlen (name);
1985
1986   while (remaining_bytes != 0)
1987     {
1988       if (g_utf8_validate (remainder, remaining_bytes, &invalid))
1989         break;
1990       valid_bytes = invalid - remainder;
1991
1992       if (string == NULL)
1993         string = g_string_sized_new (remaining_bytes);
1994
1995       g_string_append_len (string, remainder, valid_bytes);
1996       /* append U+FFFD REPLACEMENT CHARACTER */
1997       g_string_append (string, "\357\277\275");
1998
1999       remaining_bytes -= valid_bytes + 1;
2000       remainder = invalid + 1;
2001     }
2002
2003   if (string == NULL)
2004     return g_strdup (name);
2005
2006   g_string_append (string, remainder);
2007
2008   g_assert (g_utf8_validate (string->str, -1, NULL));
2009
2010   return g_string_free (string, FALSE);
2011 }
2012
2013 /**
2014  * g_filename_display_basename:
2015  * @filename: an absolute pathname in the GLib file name encoding
2016  *
2017  * Returns the display basename for the particular filename, guaranteed
2018  * to be valid UTF-8. The display name might not be identical to the filename,
2019  * for instance there might be problems converting it to UTF-8, and some files
2020  * can be translated in the display.
2021  *
2022  * If GLib can not make sense of the encoding of @filename, as a last resort it
2023  * replaces unknown characters with U+FFFD, the Unicode replacement character.
2024  * You can search the result for the UTF-8 encoding of this character (which is
2025  * "\357\277\275" in octal notation) to find out if @filename was in an invalid
2026  * encoding.
2027  *
2028  * You must pass the whole absolute pathname to this functions so that
2029  * translation of well known locations can be done.
2030  *
2031  * This function is preferred over g_filename_display_name() if you know the
2032  * whole path, as it allows translation.
2033  *
2034  * Return value: a newly allocated string containing
2035  *   a rendition of the basename of the filename in valid UTF-8
2036  *
2037  * Since: 2.6
2038  **/
2039 gchar *
2040 g_filename_display_basename (const gchar *filename)
2041 {
2042   char *basename;
2043   char *display_name;
2044
2045   g_return_val_if_fail (filename != NULL, NULL);
2046
2047   basename = g_path_get_basename (filename);
2048   display_name = g_filename_display_name (basename);
2049   g_free (basename);
2050   return display_name;
2051 }
2052
2053 /**
2054  * g_filename_display_name:
2055  * @filename: a pathname hopefully in the GLib file name encoding
2056  *
2057  * Converts a filename into a valid UTF-8 string. The conversion is
2058  * not necessarily reversible, so you should keep the original around
2059  * and use the return value of this function only for display purposes.
2060  * Unlike g_filename_to_utf8(), the result is guaranteed to be non-%NULL
2061  * even if the filename actually isn't in the GLib file name encoding.
2062  *
2063  * If GLib can not make sense of the encoding of @filename, as a last resort it
2064  * replaces unknown characters with U+FFFD, the Unicode replacement character.
2065  * You can search the result for the UTF-8 encoding of this character (which is
2066  * "\357\277\275" in octal notation) to find out if @filename was in an invalid
2067  * encoding.
2068  *
2069  * If you know the whole pathname of the file you should use
2070  * g_filename_display_basename(), since that allows location-based
2071  * translation of filenames.
2072  *
2073  * Return value: a newly allocated string containing
2074  *   a rendition of the filename in valid UTF-8
2075  *
2076  * Since: 2.6
2077  **/
2078 gchar *
2079 g_filename_display_name (const gchar *filename)
2080 {
2081   gint i;
2082   const gchar **charsets;
2083   gchar *display_name = NULL;
2084   gboolean is_utf8;
2085
2086   is_utf8 = g_get_filename_charsets (&charsets);
2087
2088   if (is_utf8)
2089     {
2090       if (g_utf8_validate (filename, -1, NULL))
2091         display_name = g_strdup (filename);
2092     }
2093
2094   if (!display_name)
2095     {
2096       /* Try to convert from the filename charsets to UTF-8.
2097        * Skip the first charset if it is UTF-8.
2098        */
2099       for (i = is_utf8 ? 1 : 0; charsets[i]; i++)
2100         {
2101           display_name = g_convert (filename, -1, "UTF-8", charsets[i],
2102                                     NULL, NULL, NULL);
2103
2104           if (display_name)
2105             break;
2106         }
2107     }
2108
2109   /* if all conversions failed, we replace invalid UTF-8
2110    * by a question mark
2111    */
2112   if (!display_name)
2113     display_name = make_valid_utf8 (filename);
2114
2115   return display_name;
2116 }
2117
2118 #define __G_CONVERT_C__
2119 #include "galiasdef.c"