glib/gconvert.c

   1 /* GLIB - Library of useful routines for C programming
   2  *
   3  * gconvert.c: Convert between character sets using iconv
   4  * Copyright Red Hat Inc., 2000
    5  * Authors: Havoc Pennington <hp@redhat.com>, Owen Taylor <otaylor@redhat.com>
   6  *
   7  * This library is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2 of the License, or (at your option) any later version.
  11  *
  12  * This library is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with this library; if not, write to the
  19  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20  * Boston, MA 02111-1307, USA.
  21  */
  22
  23 #include <config.h>
  24
  25 #include <iconv.h>
  26 #include <errno.h>
  27 #include <stdio.h>
  28 #include <string.h>
  29 #include <stdlib.h>
  30
  31 #include "glib.h"
  32
  33 #ifdef G_PLATFORM_WIN32
  34 #define STRICT
  35 #include <windows.h>
  36 #undef STRICT
  37 #endif
  38
  39 #include "glibintl.h"
  40
  41 #if defined(USE_LIBICONV_GNU) && !defined (_LIBICONV_H)
  42 #error GNU libiconv in use but included iconv.h not from libiconv
  43 #endif
  44 #if !defined(USE_LIBICONV_GNU) && defined (_LIBICONV_H)
  45 #error GNU libiconv not in use but included iconv.h is from libiconv
  46 #endif
  47
  48 GQuark
  49 g_convert_error_quark (void)
  50 {
  51   static GQuark quark;
  52   if (!quark)
  53     quark = g_quark_from_static_string ("g_convert_error");
  54
  55   return quark;
  56 }
  57
  58 static gboolean
  59 try_conversion (const char *to_codeset,
  60                 const char *from_codeset,
  61                 iconv_t    *cd)
  62 {
  63   *cd = iconv_open (to_codeset, from_codeset);
  64
  65   if (*cd == (iconv_t)-1 && errno == EINVAL)
  66     return FALSE;
  67   else
  68     return TRUE;
  69 }
  70
  71 static gboolean
  72 try_to_aliases (const char **to_aliases,
  73                 const char  *from_codeset,
  74                 iconv_t     *cd)
  75 {
  76   if (to_aliases)
  77     {
  78       const char **p = to_aliases;
  79       while (*p)
  80         {
  81           if (try_conversion (*p, from_codeset, cd))
  82             return TRUE;
  83
  84           p++;
  85         }
  86     }
  87
  88   return FALSE;
  89 }
  90
  91 extern const char **_g_charset_get_aliases (const char *canonical_name);
  92
  93 /**
  94  * g_iconv_open:
  95  * @to_codeset: destination codeset
  96  * @from_codeset: source codeset
  97  *
  98  * Same as the standard UNIX routine <function>iconv_open()</function>, but
  99  * may be implemented via libiconv on UNIX flavors that lack
 100  * a native implementation.
 101  *
 102  * GLib provides g_convert() and g_locale_to_utf8() which are likely
 103  * more convenient than the raw iconv wrappers.
 104  *
 105  * Return value: a "conversion descriptor", or (GIConv)-1 if
 106  *  opening the converter failed.
 107  **/
 108 GIConv
 109 g_iconv_open (const gchar  *to_codeset,
 110               const gchar  *from_codeset)
 111 {
 112   iconv_t cd;
 113
 114   if (!try_conversion (to_codeset, from_codeset, &cd))
 115     {
 116       const char **to_aliases = _g_charset_get_aliases (to_codeset);
 117       const char **from_aliases = _g_charset_get_aliases (from_codeset);
 118
 119       if (from_aliases)
 120         {
 121           const char **p = from_aliases;
 122           while (*p)
 123             {
 124               if (try_conversion (to_codeset, *p, &cd))
 125                 goto out;
 126
 127               if (try_to_aliases (to_aliases, *p, &cd))
 128                 goto out;
 129
 130               p++;
 131             }
 132         }
 133
 134       if (try_to_aliases (to_aliases, from_codeset, &cd))
 135         goto out;
 136     }
 137
 138   return (cd == (iconv_t)-1) ? (GIConv)-1 : (GIConv)cd;
 139 }
 140
 141 /**
 142  * g_iconv:
 143  * @converter: conversion descriptor from g_iconv_open()
 144  * @inbuf: bytes to convert
 145  * @inbytes_left: inout parameter, bytes remaining to convert in @inbuf
 146  * @outbuf: converted output bytes
 147  * @outbytes_left: inout parameter, bytes available to fill in @outbuf
 148  *
 149  * Same as the standard UNIX routine <function>iconv()</function>, but
 150  * may be implemented via libiconv on UNIX flavors that lack
 151  * a native implementation.
 152  *
 153  * GLib provides g_convert() and g_locale_to_utf8() which are likely
 154  * more convenient than the raw iconv wrappers.
 155  *
 156  * Return value: count of non-reversible conversions, or -1 on error
 157  **/
 158 size_t
 159 g_iconv (GIConv   converter,
 160          gchar  **inbuf,
 161          gsize   *inbytes_left,
 162          gchar  **outbuf,
 163          gsize   *outbytes_left)
 164 {
 165   iconv_t cd = (iconv_t)converter;
 166
 167   return iconv (cd, inbuf, inbytes_left, outbuf, outbytes_left);
 168 }
 169
 170 /**
 171  * g_iconv_close:
 172  * @converter: a conversion descriptor from g_iconv_open()
 173  *
 174  * Same as the standard UNIX routine <function>iconv_close()</function>, but
 175  * may be implemented via libiconv on UNIX flavors that lack
 176  * a native implementation. Should be called to clean up
 177  * the conversion descriptor from g_iconv_open() when
 178  * you are done converting things.
 179  *
 180  * GLib provides g_convert() and g_locale_to_utf8() which are likely
 181  * more convenient than the raw iconv wrappers.
 182  *
 183  * Return value: -1 on error, 0 on success
 184  **/
 185 gint
 186 g_iconv_close (GIConv converter)
 187 {
 188   iconv_t cd = (iconv_t)converter;
 189
 190   return iconv_close (cd);
 191 }
 192
 193
 194 #define ICONV_CACHE_SIZE   (16)
 195
 196 struct _iconv_cache_bucket {
 197   gchar *key;
 198   guint32 refcount;
 199   gboolean used;
 200   GIConv cd;
 201 };
 202
 203 static GList *iconv_cache_list;
 204 static GHashTable *iconv_cache;
 205 static GHashTable *iconv_open_hash;
 206 static guint iconv_cache_size = 0;
 207 G_LOCK_DEFINE_STATIC (iconv_cache_lock);
 208
 209 /* caller *must* hold the iconv_cache_lock */
 210 static void
 211 iconv_cache_init (void)
 212 {
 213   static gboolean initialized = FALSE;
 214
 215   if (initialized)
 216     return;
 217
 218   iconv_cache_list = NULL;
 219   iconv_cache = g_hash_table_new (g_str_hash, g_str_equal);
 220   iconv_open_hash = g_hash_table_new (g_direct_hash, g_direct_equal);
 221
 222   initialized = TRUE;
 223 }
 224
 225
 226 /**
 227  * iconv_cache_bucket_new:
 228  * @key: cache key
 229  * @cd: iconv descriptor
 230  *
 231  * Creates a new cache bucket, inserts it into the cache and
 232  * increments the cache size.
 233  *
 234  * Returns a pointer to the newly allocated cache bucket.
 235  **/
 236 struct _iconv_cache_bucket *
 237 iconv_cache_bucket_new (const gchar *key, GIConv cd)
 238 {
 239   struct _iconv_cache_bucket *bucket;
 240
 241   bucket = g_new (struct _iconv_cache_bucket, 1);
 242   bucket->key = g_strdup (key);
 243   bucket->refcount = 1;
 244   bucket->used = TRUE;
 245   bucket->cd = cd;
 246
 247   g_hash_table_insert (iconv_cache, bucket->key, bucket);
 248
 249   /* FIXME: if we sorted the list so items with few refcounts were
 250      first, then we could expire them faster in iconv_cache_expire_unused () */
 251   iconv_cache_list = g_list_prepend (iconv_cache_list, bucket);
 252
 253   iconv_cache_size++;
 254
 255   return bucket;
 256 }
 257
 258
 259 /**
 260  * iconv_cache_bucket_expire:
 261  * @node: cache bucket's node
 262  * @bucket: cache bucket
 263  *
 264  * Expires a single cache bucket @bucket. This should only ever be
 265  * called on a bucket that currently has no used iconv descriptors
 266  * open.
 267  *
 268  * @node is not a required argument. If @node is not supplied, we
 269  * search for it ourselves.
 270  **/
 271 static void
 272 iconv_cache_bucket_expire (GList *node, struct _iconv_cache_bucket *bucket)
 273 {
 274   g_hash_table_remove (iconv_cache, bucket->key);
 275
 276   if (node == NULL)
 277     node = g_list_find (iconv_cache_list, bucket);
 278
 279   g_assert (node != NULL);
 280
 281   if (node->prev)
 282     {
 283       node->prev->next = node->next;
 284       if (node->next)
 285         node->next->prev = node->prev;
 286     }
 287   else
 288     {
 289       iconv_cache_list = node->next;
 290       if (node->next)
 291         node->next->prev = NULL;
 292     }
 293
 294   g_list_free_1 (node);
 295
 296   g_free (bucket->key);
 297   g_iconv_close (bucket->cd);
 298   g_free (bucket);
 299
 300   iconv_cache_size--;
 301 }
 302
 303
 304 /**
 305  * iconv_cache_expire_unused:
 306  *
 307  * Expires as many unused cache buckets as it needs to in order to get
 308  * the total number of buckets < ICONV_CACHE_SIZE.
 309  **/
 310 static void
 311 iconv_cache_expire_unused (void)
 312 {
 313   struct _iconv_cache_bucket *bucket;
 314   GList *node, *next;
 315
 316   node = iconv_cache_list;
 317   while (node && iconv_cache_size >= ICONV_CACHE_SIZE)
 318     {
 319       next = node->next;
 320
 321       bucket = node->data;
 322       if (bucket->refcount == 0)
 323         iconv_cache_bucket_expire (node, bucket);
 324
 325       node = next;
 326     }
 327 }
 328
 329 static GIConv
 330 open_converter (const gchar *to_codeset,
 331                 const gchar *from_codeset,
 332                 GError     **error)
 333 {
 334   struct _iconv_cache_bucket *bucket;
 335   gchar *key;
 336   GIConv cd;
 337
 338   /* create our key */
 339   key = g_alloca (strlen (from_codeset) + strlen (to_codeset) + 2);
 340   sprintf (key, "%s:%s", from_codeset, to_codeset);
 341
 342   G_LOCK (iconv_cache_lock);
 343
 344   /* make sure the cache has been initialized */
 345   iconv_cache_init ();
 346
 347   bucket = g_hash_table_lookup (iconv_cache, key);
 348   if (bucket)
 349     {
 350       if (bucket->used)
 351         {
 352           cd = g_iconv_open (to_codeset, from_codeset);
 353           if (cd == (GIConv) -1)
 354             goto error;
 355         }
 356       else
 357         {
 358           /* Apparently iconv on Solaris <= 7 segfaults if you pass in
 359            * NULL for anything but inbuf; work around that. (NULL outbuf
 360            * or NULL *outbuf is allowed by Unix98.)
 361            */
 362           gsize inbytes_left = 0;
 363           gchar *outbuf = NULL;
 364           gsize outbytes_left = 0;
 365
 366           cd = bucket->cd;
 367           bucket->used = TRUE;
 368
 369           /* reset the descriptor */
 370           g_iconv (cd, NULL, &inbytes_left, &outbuf, &outbytes_left);
 371         }
 372
 373       bucket->refcount++;
 374     }
 375   else
 376     {
 377       cd = g_iconv_open (to_codeset, from_codeset);
 378       if (cd == (GIConv) -1)
 379         goto error;
 380
 381       iconv_cache_expire_unused ();
 382
 383       bucket = iconv_cache_bucket_new (key, cd);
 384     }
 385
 386   g_hash_table_insert (iconv_open_hash, cd, bucket->key);
 387
 388   G_UNLOCK (iconv_cache_lock);
 389
 390   return cd;
 391
 392  error:
 393
 394   G_UNLOCK (iconv_cache_lock);
 395
 396   /* Something went wrong.  */
 397   if (errno == EINVAL)
 398     g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_CONVERSION,
 399                  _("Conversion from character set '%s' to '%s' is not supported"),
 400                  from_codeset, to_codeset);
 401   else
 402     g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
 403                  _("Could not open converter from '%s' to '%s': %s"),
 404                  from_codeset, to_codeset, g_strerror (errno));
 405
 406   return cd;
 407 }
 408
 409 static int
 410 close_converter (GIConv converter)
 411 {
 412   struct _iconv_cache_bucket *bucket;
 413   const gchar *key;
 414   GIConv cd;
 415
 416   cd = converter;
 417
 418   if (cd == (GIConv) -1)
 419     return 0;
 420
 421   G_LOCK (iconv_cache_lock);
 422
 423   key = g_hash_table_lookup (iconv_open_hash, cd);
 424   if (key)
 425     {
 426       g_hash_table_remove (iconv_open_hash, cd);
 427
 428       bucket = g_hash_table_lookup (iconv_cache, key);
 429       g_assert (bucket);
 430
 431       bucket->refcount--;
 432
 433       if (cd == bucket->cd)
 434         bucket->used = FALSE;
 435       else
 436         g_iconv_close (cd);
 437
 438       if (!bucket->refcount && iconv_cache_size > ICONV_CACHE_SIZE)
 439         {
 440           /* expire this cache bucket */
 441           iconv_cache_bucket_expire (NULL, bucket);
 442         }
 443     }
 444   else
 445     {
 446       G_UNLOCK (iconv_cache_lock);
 447
 448       g_warning ("This iconv context wasn't opened using open_converter");
 449
 450       return g_iconv_close (converter);
 451     }
 452
 453   G_UNLOCK (iconv_cache_lock);
 454
 455   return 0;
 456 }
 457
 458
 459 /**
 460  * g_convert:
 461  * @str:           the string to convert
 462  * @len:           the length of the string
 463  * @to_codeset:    name of character set into which to convert @str
 464  * @from_codeset:  character set of @str.
 465  * @bytes_read:    location to store the number of bytes in the
 466  *                 input string that were successfully converted, or %NULL.
 467  *                 Even if the conversion was successful, this may be
 468  *                 less than @len if there were partial characters
 469  *                 at the end of the input. If the error
 470  *                 #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
 471  *                 stored will the byte offset after the last valid
 472  *                 input sequence.
 473  * @bytes_written: the number of bytes stored in the output buffer (not
 474  *                 including the terminating nul).
 475  * @error:         location to store the error occuring, or %NULL to ignore
 476  *                 errors. Any of the errors in #GConvertError may occur.
 477  *
 478  * Converts a string from one character set to another.
 479  *
 480  * Return value: If the conversion was successful, a newly allocated
 481  *               nul-terminated string, which must be freed with
 482  *               g_free(). Otherwise %NULL and @error will be set.
 483  **/
 484 gchar*
 485 g_convert (const gchar *str,
 486            gssize       len,
 487            const gchar *to_codeset,
 488            const gchar *from_codeset,
 489            gsize       *bytes_read,
 490            gsize       *bytes_written,
 491            GError     **error)
 492 {
 493   gchar *res;
 494   GIConv cd;
 495
 496   g_return_val_if_fail (str != NULL, NULL);
 497   g_return_val_if_fail (to_codeset != NULL, NULL);
 498   g_return_val_if_fail (from_codeset != NULL, NULL);
 499
 500   cd = open_converter (to_codeset, from_codeset, error);
 501
 502   if (cd == (GIConv) -1)
 503     {
 504       if (bytes_read)
 505         *bytes_read = 0;
 506
 507       if (bytes_written)
 508         *bytes_written = 0;
 509
 510       return NULL;
 511     }
 512
 513   res = g_convert_with_iconv (str, len, cd,
 514                               bytes_read, bytes_written,
 515                               error);
 516
 517   close_converter (cd);
 518
 519   return res;
 520 }
 521
 522 /**
 523  * g_convert_with_iconv:
 524  * @str:           the string to convert
 525  * @len:           the length of the string
 526  * @converter:     conversion descriptor from g_iconv_open()
 527  * @bytes_read:    location to store the number of bytes in the
 528  *                 input string that were successfully converted, or %NULL.
 529  *                 Even if the conversion was successful, this may be
 530  *                 less than @len if there were partial characters
 531  *                 at the end of the input. If the error
 532  *                 #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
 533  *                 stored will the byte offset after the last valid
 534  *                 input sequence.
 535  * @bytes_written: the number of bytes stored in the output buffer (not
 536  *                 including the terminating nul).
 537  * @error:         location to store the error occuring, or %NULL to ignore
 538  *                 errors. Any of the errors in #GConvertError may occur.
 539  *
 540  * Converts a string from one character set to another.
 541  *
 542  * Return value: If the conversion was successful, a newly allocated
 543  *               nul-terminated string, which must be freed with
 544  *               g_free(). Otherwise %NULL and @error will be set.
 545  **/
 546 gchar*
 547 g_convert_with_iconv (const gchar *str,
 548                       gssize       len,
 549                       GIConv       converter,
 550                       gsize       *bytes_read,
 551                       gsize       *bytes_written,
 552                       GError     **error)
 553 {
 554   gchar *dest;
 555   gchar *outp;
 556   const gchar *p;
 557   gsize inbytes_remaining;
 558   gsize outbytes_remaining;
 559   gsize err;
 560   gsize outbuf_size;
 561   gboolean have_error = FALSE;
 562
 563   g_return_val_if_fail (str != NULL, NULL);
 564   g_return_val_if_fail (converter != (GIConv) -1, NULL);
 565
 566   if (len < 0)
 567     len = strlen (str);
 568
 569   p = str;
 570   inbytes_remaining = len;
 571   outbuf_size = len + 1; /* + 1 for nul in case len == 1 */
 572
 573   outbytes_remaining = outbuf_size - 1; /* -1 for nul */
 574   outp = dest = g_malloc (outbuf_size);
 575
 576  again:
 577
 578   err = g_iconv (converter, (char **)&p, &inbytes_remaining, &outp, &outbytes_remaining);
 579
 580   if (err == (size_t) -1)
 581     {
 582       switch (errno)
 583         {
 584         case EINVAL:
 585           /* Incomplete text, do not report an error */
 586           break;
 587         case E2BIG:
 588           {
 589             size_t used = outp - dest;
 590
 591             outbuf_size *= 2;
 592             dest = g_realloc (dest, outbuf_size);
 593
 594             outp = dest + used;
 595             outbytes_remaining = outbuf_size - used - 1; /* -1 for nul */
 596
 597             goto again;
 598           }
 599         case EILSEQ:
 600           g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 601                        _("Invalid byte sequence in conversion input"));
 602           have_error = TRUE;
 603           break;
 604         default:
 605           g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
 606                        _("Error during conversion: %s"),
 607                        g_strerror (errno));
 608           have_error = TRUE;
 609           break;
 610         }
 611     }
 612
 613   *outp = '\0';
 614
 615   if (bytes_read)
 616     *bytes_read = p - str;
 617   else
 618     {
 619       if ((p - str) != len)
 620         {
 621           if (!have_error)
 622             {
 623               g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
 624                            _("Partial character sequence at end of input"));
 625               have_error = TRUE;
 626             }
 627         }
 628     }
 629
 630   if (bytes_written)
 631     *bytes_written = outp - dest;       /* Doesn't include '\0' */
 632
 633   if (have_error)
 634     {
 635       g_free (dest);
 636       return NULL;
 637     }
 638   else
 639     return dest;
 640 }
 641
 642 /**
 643  * g_convert_with_fallback:
 644  * @str:          the string to convert
 645  * @len:          the length of the string
 646  * @to_codeset:   name of character set into which to convert @str
 647  * @from_codeset: character set of @str.
 648  * @fallback:     UTF-8 string to use in place of character not
 649  *                present in the target encoding. (This must be
 650  *                in the target encoding), if %NULL, characters
 651  *                not in the target encoding will be represented
 652  *                as Unicode escapes \x{XXXX} or \x{XXXXXX}.
 653  * @bytes_read:   location to store the number of bytes in the
 654  *                input string that were successfully converted, or %NULL.
 655  *                Even if the conversion was successful, this may be
 656  *                less than @len if there were partial characters
 657  *                at the end of the input.
 658  * @bytes_written: the number of bytes stored in the output buffer (not
 659  *                including the terminating nul).
 660  * @error:        location to store the error occuring, or %NULL to ignore
 661  *                errors. Any of the errors in #GConvertError may occur.
 662  *
 663  * Converts a string from one character set to another, possibly
 664  * including fallback sequences for characters not representable
 665  * in the output. Note that it is not guaranteed that the specification
 666  * for the fallback sequences in @fallback will be honored. Some
 667  * systems may do a approximate conversion from @from_codeset
 668  * to @to_codeset in their <function>iconv()</function> functions,
 669  * in which case GLib will simply return that approximate conversion.
 670  *
 671  * Return value: If the conversion was successful, a newly allocated
 672  *               nul-terminated string, which must be freed with
 673  *               g_free(). Otherwise %NULL and @error will be set.
 674  **/
 675 gchar*
 676 g_convert_with_fallback (const gchar *str,
 677                          gssize       len,
 678                          const gchar *to_codeset,
 679                          const gchar *from_codeset,
 680                          gchar       *fallback,
 681                          gsize       *bytes_read,
 682                          gsize       *bytes_written,
 683                          GError     **error)
 684 {
 685   gchar *utf8;
 686   gchar *dest;
 687   gchar *outp;
 688   const gchar *insert_str = NULL;
 689   const gchar *p;
 690   gsize inbytes_remaining;
 691   const gchar *save_p = NULL;
 692   gsize save_inbytes = 0;
 693   gsize outbytes_remaining;
 694   gsize err;
 695   GIConv cd;
 696   gsize outbuf_size;
 697   gboolean have_error = FALSE;
 698   gboolean done = FALSE;
 699
 700   GError *local_error = NULL;
 701
 702   g_return_val_if_fail (str != NULL, NULL);
 703   g_return_val_if_fail (to_codeset != NULL, NULL);
 704   g_return_val_if_fail (from_codeset != NULL, NULL);
 705
 706   if (len < 0)
 707     len = strlen (str);
 708
 709   /* Try an exact conversion; we only proceed if this fails
 710    * due to an illegal sequence in the input string.
 711    */
 712   dest = g_convert (str, len, to_codeset, from_codeset,
 713                     bytes_read, bytes_written, &local_error);
 714   if (!local_error)
 715     return dest;
 716
 717   if (!g_error_matches (local_error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
 718     {
 719       g_propagate_error (error, local_error);
 720       return NULL;
 721     }
 722   else
 723     g_error_free (local_error);
 724
 725   local_error = NULL;
 726
 727   /* No go; to proceed, we need a converter from "UTF-8" to
 728    * to_codeset, and the string as UTF-8.
 729    */
 730   cd = open_converter (to_codeset, "UTF-8", error);
 731   if (cd == (GIConv) -1)
 732     {
 733       if (bytes_read)
 734         *bytes_read = 0;
 735
 736       if (bytes_written)
 737         *bytes_written = 0;
 738
 739       return NULL;
 740     }
 741
 742   utf8 = g_convert (str, len, "UTF-8", from_codeset,
 743                     bytes_read, &inbytes_remaining, error);
 744   if (!utf8)
 745     {
 746       close_converter (cd);
 747       if (bytes_written)
 748         *bytes_written = 0;
 749       return NULL;
 750     }
 751
 752   /* Now the heart of the code. We loop through the UTF-8 string, and
 753    * whenever we hit an offending character, we form fallback, convert
 754    * the fallback to the target codeset, and then go back to
 755    * converting the original string after finishing with the fallback.
 756    *
 757    * The variables save_p and save_inbytes store the input state
 758    * for the original string while we are converting the fallback
 759    */
 760   p = utf8;
 761
 762   outbuf_size = len + 1; /* + 1 for nul in case len == 1 */
 763   outbytes_remaining = outbuf_size - 1; /* -1 for nul */
 764   outp = dest = g_malloc (outbuf_size);
 765
 766   while (!done && !have_error)
 767     {
 768       size_t inbytes_tmp = inbytes_remaining;
 769       err = g_iconv (cd, (char **)&p, &inbytes_tmp, &outp, &outbytes_remaining);
 770       inbytes_remaining = inbytes_tmp;
 771
 772       if (err == (size_t) -1)
 773         {
 774           switch (errno)
 775             {
 776             case EINVAL:
 777               g_assert_not_reached();
 778               break;
 779             case E2BIG:
 780               {
 781                 size_t used = outp - dest;
 782
 783                 outbuf_size *= 2;
 784                 dest = g_realloc (dest, outbuf_size);
 785
 786                 outp = dest + used;
 787                 outbytes_remaining = outbuf_size - used - 1; /* -1 for nul */
 788
 789                 break;
 790               }
 791             case EILSEQ:
 792               if (save_p)
 793                 {
 794                   /* Error converting fallback string - fatal
 795                    */
 796                   g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 797                                _("Cannot convert fallback '%s' to codeset '%s'"),
 798                                insert_str, to_codeset);
 799                   have_error = TRUE;
 800                   break;
 801                 }
 802               else
 803                 {
 804                   if (!fallback)
 805                     {
 806                       gunichar ch = g_utf8_get_char (p);
 807                       insert_str = g_strdup_printf ("\\x{%0*X}",
 808                                                     (ch < 0x10000) ? 4 : 6,
 809                                                     ch);
 810                     }
 811                   else
 812                     insert_str = fallback;
 813
 814                   save_p = g_utf8_next_char (p);
 815                   save_inbytes = inbytes_remaining - (save_p - p);
 816                   p = insert_str;
 817                   inbytes_remaining = strlen (p);
 818                 }
 819               break;
 820             default:
 821               g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
 822                            _("Error during conversion: %s"),
 823                            g_strerror (errno));
 824               have_error = TRUE;
 825               break;
 826             }
 827         }
 828       else
 829         {
 830           if (save_p)
 831             {
 832               if (!fallback)
 833                 g_free ((gchar *)insert_str);
 834               p = save_p;
 835               inbytes_remaining = save_inbytes;
 836               save_p = NULL;
 837             }
 838           else
 839             done = TRUE;
 840         }
 841     }
 842
 843   /* Cleanup
 844    */
 845   *outp = '\0';
 846
 847   close_converter (cd);
 848
 849   if (bytes_written)
 850     *bytes_written = outp - dest;       /* Doesn't include '\0' */
 851
 852   g_free (utf8);
 853
 854   if (have_error)
 855     {
 856       if (save_p && !fallback)
 857         g_free ((gchar *)insert_str);
 858       g_free (dest);
 859       return NULL;
 860     }
 861   else
 862     return dest;
 863 }
 864
 865 /*
 866  * g_locale_to_utf8
 867  *
 868  *
 869  */
 870
 871 #ifndef G_PLATFORM_WIN32
 872
 873 static gchar *
 874 strdup_len (const gchar *string,
 875             gssize       len,
 876             gsize       *bytes_written,
 877             gsize       *bytes_read,
 878             GError      **error)
 879
 880 {
 881   gsize real_len;
 882
 883   if (!g_utf8_validate (string, -1, NULL))
 884     {
 885       if (bytes_read)
 886         *bytes_read = 0;
 887       if (bytes_written)
 888         *bytes_written = 0;
 889
 890       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 891                    _("Invalid byte sequence in conversion input"));
 892       return NULL;
 893     }
 894
 895   if (len < 0)
 896     real_len = strlen (string);
 897   else
 898     {
 899       real_len = 0;
 900
 901       while (real_len < len && string[real_len])
 902         real_len++;
 903     }
 904
 905   if (bytes_read)
 906     *bytes_read = real_len;
 907   if (bytes_written)
 908     *bytes_written = real_len;
 909
 910   return g_strndup (string, real_len);
 911 }
 912
 913 #endif
 914
 915 /**
 916  * g_locale_to_utf8:
 917  * @opsysstring:   a string in the encoding of the current locale
 918  * @len:           the length of the string, or -1 if the string is
 919  *                 nul-terminated.
 920  * @bytes_read:    location to store the number of bytes in the
 921  *                 input string that were successfully converted, or %NULL.
 922  *                 Even if the conversion was successful, this may be
 923  *                 less than @len if there were partial characters
 924  *                 at the end of the input. If the error
 925  *                 #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
 926  *                 stored will the byte offset after the last valid
 927  *                 input sequence.
 928  * @bytes_written: the number of bytes stored in the output buffer (not
 929  *                 including the terminating nul).
 930  * @error:         location to store the error occuring, or %NULL to ignore
 931  *                 errors. Any of the errors in #GConvertError may occur.
 932  *
 933  * Converts a string which is in the encoding used for strings by
 934  * the C runtime (usually the same as that used by the operating
 935  * system) in the current locale into a UTF-8 string.
 936  *
 937  * Return value: The converted string, or %NULL on an error.
 938  **/
 939 gchar *
 940 g_locale_to_utf8 (const gchar  *opsysstring,
 941                   gssize        len,
 942                   gsize        *bytes_read,
 943                   gsize        *bytes_written,
 944                   GError      **error)
 945 {
 946 #ifdef G_PLATFORM_WIN32
 947
 948   gint i, clen, total_len, wclen, first;
 949   wchar_t *wcs, wc;
 950   gchar *result, *bp;
 951   const wchar_t *wcp;
 952
 953   if (len == -1)
 954     len = strlen (opsysstring);
 955
 956   wcs = g_new (wchar_t, len);
 957   wclen = MultiByteToWideChar (CP_ACP, 0, opsysstring, len, wcs, len);
 958
 959   wcp = wcs;
 960   total_len = 0;
 961   for (i = 0; i < wclen; i++)
 962     {
 963       wc = *wcp++;
 964
 965       if (wc < 0x80)
 966         total_len += 1;
 967       else if (wc < 0x800)
 968         total_len += 2;
 969       else if (wc < 0x10000)
 970         total_len += 3;
 971       else if (wc < 0x200000)
 972         total_len += 4;
 973       else if (wc < 0x4000000)
 974         total_len += 5;
 975       else
 976         total_len += 6;
 977     }
 978
 979   result = g_malloc (total_len + 1);
 980
 981   wcp = wcs;
 982   bp = result;
 983   for (i = 0; i < wclen; i++)
 984     {
 985       wc = *wcp++;
 986
 987       if (wc < 0x80)
 988         {
 989           first = 0;
 990           clen = 1;
 991         }
 992       else if (wc < 0x800)
 993         {
 994           first = 0xc0;
 995           clen = 2;
 996         }
 997       else if (wc < 0x10000)
 998         {
 999           first = 0xe0;
1000           clen = 3;
1001         }
1002       else if (wc < 0x200000)
1003         {
1004           first = 0xf0;
1005           clen = 4;
1006         }
1007       else if (wc < 0x4000000)
1008         {
1009           first = 0xf8;
1010           clen = 5;
1011         }
1012       else
1013         {
1014           first = 0xfc;
1015           clen = 6;
1016         }
1017
1018       /* Woo-hoo! */
1019       switch (clen)
1020         {
1021         case 6: bp[5] = (wc & 0x3f) | 0x80; wc >>= 6; /* Fall through */
1022         case 5: bp[4] = (wc & 0x3f) | 0x80; wc >>= 6; /* Fall through */
1023         case 4: bp[3] = (wc & 0x3f) | 0x80; wc >>= 6; /* Fall through */
1024         case 3: bp[2] = (wc & 0x3f) | 0x80; wc >>= 6; /* Fall through */
1025         case 2: bp[1] = (wc & 0x3f) | 0x80; wc >>= 6; /* Fall through */
1026         case 1: bp[0] = wc | first;
1027         }
1028
1029       bp += clen;
1030     }
1031   *bp = 0;
1032
1033   g_free (wcs);
1034
1035   if (bytes_read)
1036     *bytes_read = len;
1037   if (bytes_written)
1038     *bytes_written = total_len;
1039
1040   return result;
1041
1042 #else  /* !G_PLATFORM_WIN32 */
1043
1044   const char *charset;
1045
1046   if (g_get_charset (&charset))
1047     return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
1048   else
1049     return g_convert (opsysstring, len,
1050                       "UTF-8", charset, bytes_read, bytes_written, error);
1051
1052 #endif /* !G_PLATFORM_WIN32 */
1053 }
1054
1055 /**
1056  * g_locale_from_utf8:
1057  * @utf8string:    a UTF-8 encoded string
1058  * @len:           the length of the string, or -1 if the string is
1059  *                 nul-terminated.
1060  * @bytes_read:    location to store the number of bytes in the
1061  *                 input string that were successfully converted, or %NULL.
1062  *                 Even if the conversion was successful, this may be
1063  *                 less than @len if there were partial characters
1064  *                 at the end of the input. If the error
1065  *                 #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
1066  *                 stored will the byte offset after the last valid
1067  *                 input sequence.
1068  * @bytes_written: the number of bytes stored in the output buffer (not
1069  *                 including the terminating nul).
1070  * @error:         location to store the error occuring, or %NULL to ignore
1071  *                 errors. Any of the errors in #GConvertError may occur.
1072  *
1073  * Converts a string from UTF-8 to the encoding used for strings by
1074  * the C runtime (usually the same as that used by the operating
1075  * system) in the current locale.
1076  *
1077  * Return value: The converted string, or %NULL on an error.
1078  **/
1079 gchar *
1080 g_locale_from_utf8 (const gchar *utf8string,
1081                     gssize       len,
1082                     gsize       *bytes_read,
1083                     gsize       *bytes_written,
1084                     GError     **error)
1085 {
1086 #ifdef G_PLATFORM_WIN32
1087
1088   gint i, mask, clen, mblen;
1089   wchar_t *wcs, *wcp;
1090   gchar *result;
1091   guchar *cp, *end, c;
1092   gint n;
1093
1094   if (len == -1)
1095     len = strlen (utf8string);
1096
1097   /* First convert to wide chars */
1098   cp = (guchar *) utf8string;
1099   end = cp + len;
1100   n = 0;
1101   wcs = g_new (wchar_t, len + 1);
1102   wcp = wcs;
1103   while (cp != end)
1104     {
1105       mask = 0;
1106       c = *cp;
1107
1108       if (c < 0x80)
1109         {
1110           clen = 1;
1111           mask = 0x7f;
1112         }
1113       else if ((c & 0xe0) == 0xc0)
1114         {
1115           clen = 2;
1116           mask = 0x1f;
1117         }
1118       else if ((c & 0xf0) == 0xe0)
1119         {
1120           clen = 3;
1121           mask = 0x0f;
1122         }
1123       else if ((c & 0xf8) == 0xf0)
1124         {
1125           clen = 4;
1126           mask = 0x07;
1127         }
1128       else if ((c & 0xfc) == 0xf8)
1129         {
1130           clen = 5;
1131           mask = 0x03;
1132         }
1133       else if ((c & 0xfc) == 0xfc)
1134         {
1135           clen = 6;
1136           mask = 0x01;
1137         }
1138       else
1139         {
1140           g_free (wcs);
1141           return NULL;
1142         }
1143
1144       if (cp + clen > end)
1145         {
1146           g_free (wcs);
1147           return NULL;
1148         }
1149
1150       *wcp = (cp[0] & mask);
1151       for (i = 1; i < clen; i++)
1152         {
1153           if ((cp[i] & 0xc0) != 0x80)
1154             {
1155               g_free (wcs);
1156               return NULL;
1157             }
1158           *wcp <<= 6;
1159           *wcp |= (cp[i] & 0x3f);
1160         }
1161
1162       cp += clen;
1163       wcp++;
1164       n++;
1165     }
1166   if (cp != end)
1167     {
1168       g_free (wcs);
1169       return NULL;
1170     }
1171
1172   /* n is the number of wide chars constructed */
1173
1174   /* Convert to a string in the current ANSI codepage */
1175
1176   result = g_new (gchar, 3 * n + 1);
1177   mblen = WideCharToMultiByte (CP_ACP, 0, wcs, n, result, 3*n, NULL, NULL);
1178   result[mblen] = 0;
1179   g_free (wcs);
1180
1181   if (bytes_read)
1182     *bytes_read = len;
1183   if (bytes_written)
1184     *bytes_written = mblen;
1185
1186   return result;
1187
1188 #else  /* !G_PLATFORM_WIN32 */
1189
1190   const gchar *charset;
1191
1192   if (g_get_charset (&charset))
1193     return strdup_len (utf8string, len, bytes_read, bytes_written, error);
1194   else
1195     return g_convert (utf8string, len,
1196                       charset, "UTF-8", bytes_read, bytes_written, error);
1197
1198 #endif /* !G_PLATFORM_WIN32 */
1199 }
1200
1201 #ifndef G_PLATFORM_WIN32
1202 static gboolean
1203 have_broken_filenames (void)
1204 {
1205   static gboolean initialized = FALSE;
1206   static gboolean broken;
1207
1208   if (initialized)
1209     return broken;
1210
1211   broken = (getenv ("G_BROKEN_FILENAMES") != NULL);
1212
1213   initialized = TRUE;
1214
1215   return broken;
1216 }
1217 #endif /* !G_PLATFORM_WIN32 */
1218
/* Called from g_thread_init() so that the static data used by this
 * file gets initialized in a threadsafe way.
 */
void
g_convert_init (void)
{
#ifndef G_PLATFORM_WIN32
  /* Only the caching side effect is wanted; the answer is not needed. */
  have_broken_filenames ();
#endif /* !G_PLATFORM_WIN32 */
}
1229
1230 /**
1231  * g_filename_to_utf8:
1232  * @opsysstring:   a string in the encoding for filenames
1233  * @len:           the length of the string, or -1 if the string is
1234  *                 nul-terminated.
1235  * @bytes_read:    location to store the number of bytes in the
1236  *                 input string that were successfully converted, or %NULL.
1237  *                 Even if the conversion was successful, this may be
1238  *                 less than @len if there were partial characters
1239  *                 at the end of the input. If the error
1240  *                 #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
1241  *                 stored will the byte offset after the last valid
1242  *                 input sequence.
1243  * @bytes_written: the number of bytes stored in the output buffer (not
1244  *                 including the terminating nul).
1245  * @error:         location to store the error occuring, or %NULL to ignore
1246  *                 errors. Any of the errors in #GConvertError may occur.
1247  *
1248  * Converts a string which is in the encoding used for filenames
1249  * into a UTF-8 string.
1250  *
1251  * Return value: The converted string, or %NULL on an error.
1252  **/
1253 gchar*
1254 g_filename_to_utf8 (const gchar *opsysstring,
1255                     gssize       len,
1256                     gsize       *bytes_read,
1257                     gsize       *bytes_written,
1258                     GError     **error)
1259 {
1260 #ifdef G_PLATFORM_WIN32
1261   return g_locale_to_utf8 (opsysstring, len,
1262                            bytes_read, bytes_written,
1263                            error);
1264 #else  /* !G_PLATFORM_WIN32 */
1265
1266   if (have_broken_filenames ())
1267     return g_locale_to_utf8 (opsysstring, len,
1268                              bytes_read, bytes_written,
1269                              error);
1270   else
1271     return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
1272 #endif /* !G_PLATFORM_WIN32 */
1273 }
1274
1275 /**
1276  * g_filename_from_utf8:
1277  * @utf8string:    a UTF-8 encoded string.
1278  * @len:           the length of the string, or -1 if the string is
1279  *                 nul-terminated.
1280  * @bytes_read:    location to store the number of bytes in the
1281  *                 input string that were successfully converted, or %NULL.
1282  *                 Even if the conversion was successful, this may be
1283  *                 less than @len if there were partial characters
1284  *                 at the end of the input. If the error
1285  *                 #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
1286  *                 stored will the byte offset after the last valid
1287  *                 input sequence.
1288  * @bytes_written: the number of bytes stored in the output buffer (not
1289  *                 including the terminating nul).
1290  * @error:         location to store the error occuring, or %NULL to ignore
1291  *                 errors. Any of the errors in #GConvertError may occur.
1292  *
1293  * Converts a string from UTF-8 to the encoding used for filenames.
1294  *
1295  * Return value: The converted string, or %NULL on an error.
1296  **/
1297 gchar*
1298 g_filename_from_utf8 (const gchar *utf8string,
1299                       gssize       len,
1300                       gsize       *bytes_read,
1301                       gsize       *bytes_written,
1302                       GError     **error)
1303 {
1304 #ifdef G_PLATFORM_WIN32
1305   return g_locale_from_utf8 (utf8string, len,
1306                              bytes_read, bytes_written,
1307                              error);
1308 #else  /* !G_PLATFORM_WIN32 */
1309   if (have_broken_filenames ())
1310     return g_locale_from_utf8 (utf8string, len,
1311                                bytes_read, bytes_written,
1312                                error);
1313   else
1314     return strdup_len (utf8string, len, bytes_read, bytes_written, error);
1315 #endif /* !G_PLATFORM_WIN32 */
1316 }
1317
1318 /* Test of haystack has the needle prefix, comparing case
1319  * insensitive. haystack may be UTF-8, but needle must
1320  * contain only ascii. */
1321 static gboolean
1322 has_case_prefix (const gchar *haystack, const gchar *needle)
1323 {
1324   const gchar *h, *n;
1325
1326   /* Eat one character at a time. */
1327   h = haystack;
1328   n = needle;
1329
1330   while (*n && *h &&
1331          g_ascii_tolower (*n) == g_ascii_tolower (*h))
1332     {
1333       n++;
1334       h++;
1335     }
1336
1337   return *n == '\0';
1338 }
1339
/* Escaping policies for g_escape_uri_string().  Each policy is a
 * distinct bit so that one table entry per character can record which
 * policies let that character through unescaped.
 */
typedef enum {
  UNSAFE_ALL        = 1 << 0,  /* Escape all unsafe characters   */
  UNSAFE_ALLOW_PLUS = 1 << 1,  /* Allows '+'  */
  UNSAFE_PATH       = 1 << 2,  /* Allows '/' and '?' and '&' and '='  */
  UNSAFE_DOS_PATH   = 1 << 3,  /* Allows '/' and '?' and '&' and '=' and ':' */
  UNSAFE_HOST       = 1 << 4,  /* Allows '/' and ':' and '@' */
  UNSAFE_SLASHES    = 1 << 5   /* Allows all characters except for '/' and '%' */
} UnsafeCharacterSet;
1348
/* Per-character escaping table.  Entry [c - 32] is a bit mask of the
 * UnsafeCharacterSet values under which the character c may be left
 * unescaped; g_escape_uri_string() tests it against the requested mask.
 * Characters outside the range 32..127 have no entry and are always
 * escaped.
 */
static const guchar acceptable[96] = {
  /* A table of the ASCII chars from space (32) to DEL (127) */
  /*      !    "    #    $    %    &    '    (    )    *    +    ,    -    .    / */
  0x00,0x3F,0x20,0x20,0x20,0x00,0x2C,0x3F,0x3F,0x3F,0x3F,0x22,0x20,0x3F,0x3F,0x1C,
  /* 0    1    2    3    4    5    6    7    8    9    :    ;    <    =    >    ? */
  0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x38,0x20,0x20,0x2C,0x20,0x2C,
  /* @    A    B    C    D    E    F    G    H    I    J    K    L    M    N    O */
  0x30,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,
  /* P    Q    R    S    T    U    V    W    X    Y    Z    [    \    ]    ^    _ */
  0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x20,0x20,0x20,0x20,0x3F,
  /* `    a    b    c    d    e    f    g    h    i    j    k    l    m    n    o */
  0x20,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,
  /* p    q    r    s    t    u    v    w    x    y    z    {    |    }    ~  DEL */
  0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x20,0x20,0x20,0x3F,0x20
};
1364
/* Upper-case hexadecimal digits indexed by nibble value, used when
 * writing %XX escapes.  The array holds exactly the 16 digits and no
 * terminating nul, so it must not be used as a C string.
 */
static const gchar hex[16] = "0123456789ABCDEF";
1366
1367 /* Note: This escape function works on file: URIs, but if you want to
1368  * escape something else, please read RFC-2396 */
1369 static gchar *
1370 g_escape_uri_string (const gchar *string,
1371                      UnsafeCharacterSet mask)
1372 {
1373 #define ACCEPTABLE(a) ((a)>=32 && (a)<128 && (acceptable[(a)-32] & use_mask))
1374
1375   const gchar *p;
1376   gchar *q;
1377   gchar *result;
1378   int c;
1379   gint unacceptable;
1380   UnsafeCharacterSet use_mask;
1381
1382   g_return_val_if_fail (mask == UNSAFE_ALL
1383                         || mask == UNSAFE_ALLOW_PLUS
1384                         || mask == UNSAFE_PATH
1385                         || mask == UNSAFE_DOS_PATH
1386                         || mask == UNSAFE_HOST
1387                         || mask == UNSAFE_SLASHES, NULL);
1388
1389   unacceptable = 0;
1390   use_mask = mask;
1391   for (p = string; *p != '\0'; p++)
1392     {
1393       c = (guchar) *p;
1394       if (!ACCEPTABLE (c))
1395         unacceptable++;
1396     }
1397
1398   result = g_malloc (p - string + unacceptable * 2 + 1);
1399
1400   use_mask = mask;
1401   for (q = result, p = string; *p != '\0'; p++)
1402     {
1403       c = (guchar) *p;
1404
1405       if (!ACCEPTABLE (c))
1406         {
1407           *q++ = '%'; /* means hex coming */
1408           *q++ = hex[c >> 4];
1409           *q++ = hex[c & 15];
1410         }
1411       else
1412         *q++ = *p;
1413     }
1414
1415   *q = '\0';
1416
1417   return result;
1418 }
1419
1420
1421 static gchar *
1422 g_escape_file_uri (const gchar *hostname,
1423                    const gchar *pathname)
1424 {
1425   char *escaped_hostname = NULL;
1426   char *escaped_path;
1427   char *res;
1428
1429 #ifdef G_OS_WIN32
1430   char *p, *backslash;
1431
1432   /* Turn backslashes into forward slashes. That's what Netscape
1433    * does, and they are actually more or less equivalent in Windows.
1434    */
1435
1436   pathname = g_strdup (pathname);
1437   p = (char *) pathname;
1438
1439   while ((backslash = strchr (p, '\\')) != NULL)
1440     {
1441       *backslash = '/';
1442       p = backslash + 1;
1443     }
1444 #endif
1445
1446   if (hostname && *hostname != '\0')
1447     {
1448       escaped_hostname = g_escape_uri_string (hostname, UNSAFE_HOST);
1449     }
1450
1451   escaped_path = g_escape_uri_string (pathname, UNSAFE_DOS_PATH);
1452
1453   res = g_strconcat ("file://",
1454                      (escaped_hostname) ? escaped_hostname : "",
1455                      (*escaped_path != '/') ? "/" : "",
1456                      escaped_path,
1457                      NULL);
1458
1459 #ifdef G_OS_WIN32
1460   g_free ((char *) pathname);
1461 #endif
1462
1463   g_free (escaped_hostname);
1464   g_free (escaped_path);
1465
1466   return res;
1467 }
1468
/* Decodes the two hexadecimal digits at scanner[0] and scanner[1].
 * Returns the byte value they encode, or -1 if either character is not
 * a hex digit.  scanner[1] is not examined when scanner[0] is invalid.
 */
static int
unescape_character (const char *scanner)
{
  int high_nibble;
  int low_nibble;

  high_nibble = g_ascii_xdigit_value (scanner[0]);
  if (high_nibble < 0)
    return -1;

  low_nibble = g_ascii_xdigit_value (scanner[1]);
  if (low_nibble < 0)
    return -1;

  return high_nibble * 16 + low_nibble;
}
1485
1486 static gchar *
1487 g_unescape_uri_string (const char *escaped,
1488                        int         len,
1489                        const char *illegal_escaped_characters,
1490                        gboolean    ascii_must_not_be_escaped)
1491 {
1492   const gchar *in, *in_end;
1493   gchar *out, *result;
1494   int c;
1495
1496   if (escaped == NULL)
1497     return NULL;
1498
1499   if (len < 0)
1500     len = strlen (escaped);
1501
1502   result = g_malloc (len + 1);
1503
1504   out = result;
1505   for (in = escaped, in_end = escaped + len; in < in_end; in++)
1506     {
1507       c = *in;
1508
1509       if (c == '%')
1510         {
1511           /* catch partial escape sequences past the end of the substring */
1512           if (in + 3 > in_end)
1513             break;
1514
1515           c = unescape_character (in + 1);
1516
1517           /* catch bad escape sequences and NUL characters */
1518           if (c <= 0)
1519             break;
1520
1521           /* catch escaped ASCII */
1522           if (ascii_must_not_be_escaped && c <= 0x7F)
1523             break;
1524
1525           /* catch other illegal escaped characters */
1526           if (strchr (illegal_escaped_characters, c) != NULL)
1527             break;
1528
1529           in += 2;
1530         }
1531
1532       *out++ = c;
1533     }
1534
1535   g_assert (out - result <= len);
1536   *out = '\0';
1537
1538   if (in != in_end || !g_utf8_validate (result, -1, NULL))
1539     {
1540       g_free (result);
1541       return NULL;
1542     }
1543
1544   return result;
1545 }
1546
1547 static gboolean
1548 is_escalphanum (gunichar c)
1549 {
1550   return c > 0x7F || g_ascii_isalnum (c);
1551 }
1552
1553 static gboolean
1554 is_escalpha (gunichar c)
1555 {
1556   return c > 0x7F || g_ascii_isalpha (c);
1557 }
1558
/* Checks that hostname is a sequence of '.'-separated labels.
 *
 * Every label must start with a character accepted by is_escalphanum(),
 * may continue with such characters or '-', and must not end in '-'.
 * The final label (followed by the end of the string, or by a single
 * trailing '.') must additionally start with a character accepted by
 * is_escalpha().
 *
 * allows an empty string
 */
static gboolean
hostname_validate (const char *hostname)
{
  const char *p;
  gunichar c, first_char, last_char;

  p = hostname;
  if (*p == '\0')
    return TRUE;
  do
    {
      /* read in a label */
      c = g_utf8_get_char (p);
      p = g_utf8_next_char (p);
      if (!is_escalphanum (c))
        return FALSE;
      first_char = c;
      /* Consume the rest of the label; when the loop ends, c is the
       * character that terminated the label and last_char the one
       * just before it.
       */
      do
        {
          last_char = c;
          c = g_utf8_get_char (p);
          p = g_utf8_next_char (p);
        }
      while (is_escalphanum (c) || c == '-');
      if (last_char == '-')
        return FALSE;

      /* if that was the last label, check that it was a toplabel */
      /* (p is only dereferenced when c was '.', so it still points
       * inside the string here)
       */
      if (c == '\0' || (c == '.' && *p == '\0'))
        return is_escalpha (first_char);
    }
  while (c == '.');
  /* The label was terminated by something other than '.' or the end */
  return FALSE;
}
1594
1595 /**
1596  * g_filename_from_uri:
1597  * @uri: a uri describing a filename (escaped, encoded in UTF-8).
1598  * @hostname: Location to store hostname for the URI, or %NULL.
1599  *            If there is no hostname in the URI, %NULL will be
1600  *            stored in this location.
1601  * @error: location to store the error occuring, or %NULL to ignore
1602  *         errors. Any of the errors in #GConvertError may occur.
1603  *
1604  * Converts an escaped UTF-8 encoded URI to a local filename in the
1605  * encoding used for filenames.
1606  *
1607  * Return value: a newly-allocated string holding the resulting
1608  *               filename, or %NULL on an error.
1609  **/
1610 gchar *
1611 g_filename_from_uri (const char *uri,
1612                      char      **hostname,
1613                      GError    **error)
1614 {
1615   const char *path_part;
1616   const char *host_part;
1617   char *unescaped_hostname;
1618   char *result;
1619   char *filename;
1620   int offs;
1621 #ifdef G_OS_WIN32
1622   char *p, *slash;
1623 #endif
1624
1625   if (hostname)
1626     *hostname = NULL;
1627
1628   if (!has_case_prefix (uri, "file:/"))
1629     {
1630       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1631                    _("The URI '%s' is not an absolute URI using the file scheme"),
1632                    uri);
1633       return NULL;
1634     }
1635
1636   path_part = uri + strlen ("file:");
1637
1638   if (strchr (path_part, '#') != NULL)
1639     {
1640       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1641                    _("The local file URI '%s' may not include a '#'"),
1642                    uri);
1643       return NULL;
1644     }
1645
1646   if (has_case_prefix (path_part, "///"))
1647     path_part += 2;
1648   else if (has_case_prefix (path_part, "//"))
1649     {
1650       path_part += 2;
1651       host_part = path_part;
1652
1653       path_part = strchr (path_part, '/');
1654
1655       if (path_part == NULL)
1656         {
1657           g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1658                        _("The URI '%s' is invalid"),
1659                        uri);
1660           return NULL;
1661         }
1662
1663       unescaped_hostname = g_unescape_uri_string (host_part, path_part - host_part, "", TRUE);
1664
1665       if (unescaped_hostname == NULL ||
1666           !hostname_validate (unescaped_hostname))
1667         {
1668           g_free (unescaped_hostname);
1669           g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1670                        _("The hostname of the URI '%s' is invalid"),
1671                        uri);
1672           return NULL;
1673         }
1674
1675       if (hostname)
1676         *hostname = unescaped_hostname;
1677       else
1678         g_free (unescaped_hostname);
1679     }
1680
1681   filename = g_unescape_uri_string (path_part, -1, "/", FALSE);
1682
1683   if (filename == NULL)
1684     {
1685       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1686                    _("The URI '%s' contains invalidly escaped characters"),
1687                    uri);
1688       return NULL;
1689     }
1690
1691   offs = 0;
1692 #ifdef G_OS_WIN32
1693   /* Drop localhost */
1694   if (hostname && *hostname != NULL &&
1695       g_ascii_strcasecmp (*hostname, "localhost") == 0)
1696     {
1697       g_free (*hostname);
1698       *hostname = NULL;
1699     }
1700
1701   /* Turn slashes into backslashes, because that's the canonical spelling */
1702   p = filename;
1703   while ((slash = strchr (p, '/')) != NULL)
1704     {
1705       *slash = '\\';
1706       p = slash + 1;
1707     }
1708
1709   /* Windows URIs with a drive letter can be like "file://host/c:/foo"
1710    * or "file://host/c|/foo" (some Netscape versions). In those cases, start
1711    * the filename from the drive letter.
1712    */
1713   if (g_ascii_isalpha (filename[1]))
1714     {
1715       if (filename[2] == ':')
1716         offs = 1;
1717       else if (filename[2] == '|')
1718         {
1719           filename[2] = ':';
1720           offs = 1;
1721         }
1722     }
1723 #endif
1724
1725   result = g_filename_from_utf8 (filename + offs, -1, NULL, NULL, error);
1726   g_free (filename);
1727
1728   return result;
1729 }
1730
1731 /**
1732  * g_filename_to_uri:
1733  * @filename: an absolute filename specified in the encoding
1734  *            used for filenames by the operating system.
1735  * @hostname: A UTF-8 encoded hostname, or %NULL for none.
1736  * @error: location to store the error occuring, or %NULL to ignore
1737  *         errors. Any of the errors in #GConvertError may occur.
1738  *
1739  * Converts an absolute filename to an escaped UTF-8 encoded URI.
1740  *
1741  * Return value: a newly-allocated string holding the resulting
1742  *               URI, or %NULL on an error.
1743  **/
1744 gchar *
1745 g_filename_to_uri   (const char *filename,
1746                      const char *hostname,
1747                      GError    **error)
1748 {
1749   char *escaped_uri;
1750   char *utf8_filename;
1751
1752   g_return_val_if_fail (filename != NULL, NULL);
1753
1754   if (!g_path_is_absolute (filename))
1755     {
1756       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NOT_ABSOLUTE_PATH,
1757                    _("The pathname '%s' is not an absolute path"),
1758                    filename);
1759       return NULL;
1760     }
1761
1762   if (hostname &&
1763       !(g_utf8_validate (hostname, -1, NULL)
1764         && hostname_validate (hostname)))
1765     {
1766       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1767                    _("Invalid hostname"));
1768       return NULL;
1769     }
1770
1771   utf8_filename = g_filename_to_utf8 (filename, -1, NULL, NULL, error);
1772   if (utf8_filename == NULL)
1773     return NULL;
1774
1775 #ifdef G_OS_WIN32
1776   /* Don't use localhost unnecessarily */
1777   if (hostname && g_ascii_strcasecmp (hostname, "localhost") == 0)
1778     hostname = NULL;
1779 #endif
1780
1781   escaped_uri = g_escape_file_uri (hostname,
1782                                    utf8_filename);
1783   g_free (utf8_filename);
1784
1785   return escaped_uri;
1786 }
1787