glib/gconvert.c

   1 /* GLIB - Library of useful routines for C programming
   2  *
   3  * gconvert.c: Convert between character sets using iconv
   4  * Copyright Red Hat Inc., 2000
 * Authors: Havoc Pennington <hp@redhat.com>, Owen Taylor <otaylor@redhat.com>
   6  *
   7  * This library is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2 of the License, or (at your option) any later version.
  11  *
  12  * This library is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with this library; if not, write to the
  19  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20  * Boston, MA 02111-1307, USA.
  21  */
  22
  23 #include <iconv.h>
  24 #include <errno.h>
  25 #include <string.h>
  26 #include <stdlib.h>
  27
  28 #include "glib.h"
  29 #include "config.h"
  30
  31 #ifdef G_PLATFORM_WIN32
  32 #define STRICT
  33 #include <windows.h>
  34 #undef STRICT
  35 #endif
  36
  37 #include "glibintl.h"
  38
  39 GQuark
  40 g_convert_error_quark()
  41 {
  42   static GQuark quark;
  43   if (!quark)
  44     quark = g_quark_from_static_string ("g_convert_error");
  45
  46   return quark;
  47 }
  48
  49 #if defined(USE_LIBICONV) && !defined (_LIBICONV_H)
  50 #error libiconv in use but included iconv.h not from libiconv
  51 #endif
  52 #if !defined(USE_LIBICONV) && defined (_LIBICONV_H)
  53 #error libiconv not in use but included iconv.h is from libiconv
  54 #endif
  55
  56 /**
  57  * g_iconv_open:
  58  * @to_codeset: destination codeset
  59  * @from_codeset: source codeset
  60  *
  61  * Same as the standard UNIX routine iconv_open(), but
  62  * may be implemented via libiconv on UNIX flavors that lack
  63  * a native implementation.
  64  *
  65  * GLib provides g_convert() and g_locale_to_utf8() which are likely
  66  * more convenient than the raw iconv wrappers.
  67  *
  68  * Return value: a "conversion descriptor"
  69  **/
  70 GIConv
  71 g_iconv_open (const gchar  *to_codeset,
  72               const gchar  *from_codeset)
  73 {
  74   iconv_t cd = iconv_open (to_codeset, from_codeset);
  75
  76   return (GIConv)cd;
  77 }
  78
  79 /**
  80  * g_iconv:
  81  * @converter: conversion descriptor from g_iconv_open()
  82  * @inbuf: bytes to convert
  83  * @inbytes_left: inout parameter, bytes remaining to convert in @inbuf
  84  * @outbuf: converted output bytes
  85  * @outbytes_left: inout parameter, bytes available to fill in @outbuf
  86  *
  87  * Same as the standard UNIX routine iconv(), but
  88  * may be implemented via libiconv on UNIX flavors that lack
  89  * a native implementation.
  90  *
  91  * GLib provides g_convert() and g_locale_to_utf8() which are likely
  92  * more convenient than the raw iconv wrappers.
  93  *
  94  * Return value: count of non-reversible conversions, or -1 on error
  95  **/
  96 size_t
  97 g_iconv (GIConv   converter,
  98          gchar  **inbuf,
  99          gsize   *inbytes_left,
 100          gchar  **outbuf,
 101          gsize   *outbytes_left)
 102 {
 103   iconv_t cd = (iconv_t)converter;
 104
 105   return iconv (cd, inbuf, inbytes_left, outbuf, outbytes_left);
 106 }
 107
 108 /**
 109  * g_iconv_close:
 110  * @converter: a conversion descriptor from g_iconv_open()
 111  *
 112  * Same as the standard UNIX routine iconv_close(), but
 113  * may be implemented via libiconv on UNIX flavors that lack
 114  * a native implementation. Should be called to clean up
 115  * the conversion descriptor from iconv_open() when
 116  * you are done converting things.
 117  *
 118  * GLib provides g_convert() and g_locale_to_utf8() which are likely
 119  * more convenient than the raw iconv wrappers.
 120  *
 121  * Return value: -1 on error, 0 on success
 122  **/
 123 gint
 124 g_iconv_close (GIConv converter)
 125 {
 126   iconv_t cd = (iconv_t)converter;
 127
 128   return iconv_close (cd);
 129 }
 130
 131 static GIConv
 132 open_converter (const gchar *to_codeset,
 133                 const gchar *from_codeset,
 134                 GError     **error)
 135 {
 136   GIConv cd = g_iconv_open (to_codeset, from_codeset);
 137
 138   if (cd == (iconv_t) -1)
 139     {
 140       /* Something went wrong.  */
 141       if (errno == EINVAL)
 142         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_CONVERSION,
 143                      _("Conversion from character set `%s' to `%s' is not supported"),
 144                      from_codeset, to_codeset);
 145       else
 146         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
 147                      _("Could not open converter from `%s' to `%s': %s"),
 148                      from_codeset, to_codeset, strerror (errno));
 149     }
 150
 151   return cd;
 152
 153 }
 154
 155 /**
 156  * g_convert:
 157  * @str:           the string to convert
 158  * @len:           the length of the string
 159  * @to_codeset:    name of character set into which to convert @str
 160  * @from_codeset:  character set of @str.
 161  * @bytes_read:    location to store the number of bytes in the
 162  *                 input string that were successfully converted, or %NULL.
 163  *                 Even if the conversion was succesful, this may be
 164  *                 less than len if there were partial characters
 165  *                 at the end of the input. If the error
 166  *                 G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
 167  *                 stored will the byte fofset after the last valid
 168  *                 input sequence.
 169  * @bytes_written: the stored in the output buffer (not including the
 170  *                 terminating nul.
 171  * @error:         location to store the error occuring, or %NULL to ignore
 172  *                 errors. Any of the errors in #GConvertError may occur.
 173  *
 174  * Convert a string from one character set to another.
 175  *
 176  * Return value: If the conversion was successful, a newly allocated
 177  *               NUL-terminated string, which must be freed with
 178  *               g_free. Otherwise %NULL and @error will be set.
 179  **/
 180 gchar*
 181 g_convert (const gchar *str,
 182            gssize       len,
 183            const gchar *to_codeset,
 184            const gchar *from_codeset,
 185            gsize       *bytes_read,
 186            gsize       *bytes_written,
 187            GError     **error)
 188 {
 189   gchar *res;
 190   GIConv cd;
 191
 192   g_return_val_if_fail (str != NULL, NULL);
 193   g_return_val_if_fail (to_codeset != NULL, NULL);
 194   g_return_val_if_fail (from_codeset != NULL, NULL);
 195
 196   cd = open_converter (to_codeset, from_codeset, error);
 197
 198   if (cd == (GIConv) -1)
 199     {
 200       if (bytes_read)
 201         *bytes_read = 0;
 202
 203       if (bytes_written)
 204         *bytes_written = 0;
 205
 206       return NULL;
 207     }
 208
 209   res = g_convert_with_iconv (str, len, cd,
 210                               bytes_read, bytes_written,
 211                               error);
 212
 213   g_iconv_close (cd);
 214
 215   return res;
 216 }
 217
 218 /**
 219  * g_convert_with_iconv:
 220  * @str:           the string to convert
 221  * @len:           the length of the string
 222  * @converter:     conversion descriptor from g_iconv_open()
 223  * @bytes_read:    location to store the number of bytes in the
 224  *                 input string that were successfully converted, or %NULL.
 225  *                 Even if the conversion was succesful, this may be
 226  *                 less than len if there were partial characters
 227  *                 at the end of the input. If the error
 228  *                 G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
 229  *                 stored will the byte fofset after the last valid
 230  *                 input sequence.
 231  * @bytes_written: the stored in the output buffer (not including the
 232  *                 terminating nul.
 233  * @error:         location to store the error occuring, or %NULL to ignore
 234  *                 errors. Any of the errors in #GConvertError may occur.
 235  *
 236  * Convert a string from one character set to another.
 237  *
 238  * Return value: If the conversion was successful, a newly allocated
 239  *               NUL-terminated string, which must be freed with
 240  *               g_free. Otherwise %NULL and @error will be set.
 241  **/
 242 gchar*
 243 g_convert_with_iconv (const gchar *str,
 244                       gssize       len,
 245                       GIConv       converter,
 246                       gsize       *bytes_read,
 247                       gsize       *bytes_written,
 248                       GError     **error)
 249 {
 250   gchar *dest;
 251   gchar *outp;
 252   const gchar *p;
 253   gsize inbytes_remaining;
 254   gsize outbytes_remaining;
 255   gsize err;
 256   gsize outbuf_size;
 257   gboolean have_error = FALSE;
 258
 259   g_return_val_if_fail (str != NULL, NULL);
 260   g_return_val_if_fail (converter != (GIConv) -1, NULL);
 261
 262   if (len < 0)
 263     len = strlen (str);
 264
 265   p = str;
 266   inbytes_remaining = len;
 267   outbuf_size = len + 1; /* + 1 for nul in case len == 1 */
 268
 269   outbytes_remaining = outbuf_size - 1; /* -1 for nul */
 270   outp = dest = g_malloc (outbuf_size);
 271
 272  again:
 273
 274   err = g_iconv (converter, (char **)&p, &inbytes_remaining, &outp, &outbytes_remaining);
 275
 276   if (err == (size_t) -1)
 277     {
 278       switch (errno)
 279         {
 280         case EINVAL:
 281           /* Incomplete text, do not report an error */
 282           break;
 283         case E2BIG:
 284           {
 285             size_t used = outp - dest;
 286
 287             outbuf_size *= 2;
 288             dest = g_realloc (dest, outbuf_size);
 289
 290             outp = dest + used;
 291             outbytes_remaining = outbuf_size - used - 1; /* -1 for nul */
 292
 293             goto again;
 294           }
 295         case EILSEQ:
 296           g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 297                        _("Invalid byte sequence in conversion input"));
 298           have_error = TRUE;
 299           break;
 300         default:
 301           g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
 302                        _("Error during conversion: %s"),
 303                        strerror (errno));
 304           have_error = TRUE;
 305           break;
 306         }
 307     }
 308
 309   *outp = '\0';
 310
 311   if (bytes_read)
 312     *bytes_read = p - str;
 313   else
 314     {
 315       if ((p - str) != len)
 316         {
 317           if (!have_error)
 318             {
 319               g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
 320                            _("Partial character sequence at end of input"));
 321               have_error = TRUE;
 322             }
 323         }
 324     }
 325
 326   if (bytes_written)
 327     *bytes_written = outp - dest;       /* Doesn't include '\0' */
 328
 329   if (have_error)
 330     {
 331       g_free (dest);
 332       return NULL;
 333     }
 334   else
 335     return dest;
 336 }
 337
 338 /**
 339  * g_convert_with_fallback:
 340  * @str:          the string to convert
 341  * @len:          the length of the string
 342  * @to_codeset:   name of character set into which to convert @str
 343  * @from_codeset: character set of @str.
 344  * @fallback:     UTF-8 string to use in place of character not
 345  *                present in the target encoding. (This must be
 346  *                in the target encoding), if %NULL, characters
 347  *                not in the target encoding will be represented
 348  *                as Unicode escapes \x{XXXX} or \x{XXXXXX}.
 349  * @bytes_read:   location to store the number of bytes in the
 350  *                input string that were successfully converted, or %NULL.
 351  *                Even if the conversion was succesful, this may be
 352  *                less than len if there were partial characters
 353  *                at the end of the input.
 354  * @bytes_written: the stored in the output buffer (not including the
 355  *                 terminating nul.
 356  * @error:        location to store the error occuring, or %NULL to ignore
 357  *                errors. Any of the errors in #GConvertError may occur.
 358  *
 359  * Convert a string from one character set to another, possibly
 360  * including fallback sequences for characters not representable
 361  * in the output. Note that it is not guaranteed that the specification
 362  * for the fallback sequences in @fallback will be honored. Some
 363  * systems may do a approximate conversion from @from_codeset
 364  * to @to_codeset in their iconv() functions, in which case GLib
 365  * will simply return that approximate conversion.
 366  *
 367  * Return value: If the conversion was successful, a newly allocated
 368  *               NUL-terminated string, which must be freed with
 369  *               g_free. Otherwise %NULL and @error will be set.
 370  **/
 371 gchar*
 372 g_convert_with_fallback (const gchar *str,
 373                          gssize       len,
 374                          const gchar *to_codeset,
 375                          const gchar *from_codeset,
 376                          gchar       *fallback,
 377                          gsize       *bytes_read,
 378                          gsize       *bytes_written,
 379                          GError     **error)
 380 {
 381   gchar *utf8;
 382   gchar *dest;
 383   gchar *outp;
 384   const gchar *insert_str = NULL;
 385   const gchar *p;
 386   gsize inbytes_remaining;
 387   const gchar *save_p = NULL;
 388   gsize save_inbytes = 0;
 389   gsize outbytes_remaining;
 390   gsize err;
 391   GIConv cd;
 392   gsize outbuf_size;
 393   gboolean have_error = FALSE;
 394   gboolean done = FALSE;
 395
 396   GError *local_error = NULL;
 397
 398   g_return_val_if_fail (str != NULL, NULL);
 399   g_return_val_if_fail (to_codeset != NULL, NULL);
 400   g_return_val_if_fail (from_codeset != NULL, NULL);
 401
 402   if (len < 0)
 403     len = strlen (str);
 404
 405   /* Try an exact conversion; we only proceed if this fails
 406    * due to an illegal sequence in the input string.
 407    */
 408   dest = g_convert (str, len, to_codeset, from_codeset,
 409                     bytes_read, bytes_written, &local_error);
 410   if (!local_error)
 411     return dest;
 412
 413   if (!g_error_matches (local_error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
 414     {
 415       g_propagate_error (error, local_error);
 416       return NULL;
 417     }
 418   else
 419     g_error_free (local_error);
 420
 421   local_error = NULL;
 422
 423   /* No go; to proceed, we need a converter from "UTF-8" to
 424    * to_codeset, and the string as UTF-8.
 425    */
 426   cd = open_converter (to_codeset, "UTF-8", error);
 427   if (cd == (GIConv) -1)
 428     {
 429       if (bytes_read)
 430         *bytes_read = 0;
 431
 432       if (bytes_written)
 433         *bytes_written = 0;
 434
 435       return NULL;
 436     }
 437
 438   utf8 = g_convert (str, len, "UTF-8", from_codeset,
 439                     bytes_read, &inbytes_remaining, error);
 440   if (!utf8)
 441     return NULL;
 442
 443   /* Now the heart of the code. We loop through the UTF-8 string, and
 444    * whenever we hit an offending character, we form fallback, convert
 445    * the fallback to the target codeset, and then go back to
 446    * converting the original string after finishing with the fallback.
 447    *
 448    * The variables save_p and save_inbytes store the input state
 449    * for the original string while we are converting the fallback
 450    */
 451   p = utf8;
 452
 453   outbuf_size = len + 1; /* + 1 for nul in case len == 1 */
 454   outbytes_remaining = outbuf_size - 1; /* -1 for nul */
 455   outp = dest = g_malloc (outbuf_size);
 456
 457   while (!done && !have_error)
 458     {
 459       size_t inbytes_tmp = inbytes_remaining;
 460       err = g_iconv (cd, (char **)&p, &inbytes_tmp, &outp, &outbytes_remaining);
 461       inbytes_remaining = inbytes_tmp;
 462
 463       if (err == (size_t) -1)
 464         {
 465           switch (errno)
 466             {
 467             case EINVAL:
 468               g_assert_not_reached();
 469               break;
 470             case E2BIG:
 471               {
 472                 size_t used = outp - dest;
 473
 474                 outbuf_size *= 2;
 475                 dest = g_realloc (dest, outbuf_size);
 476
 477                 outp = dest + used;
 478                 outbytes_remaining = outbuf_size - used - 1; /* -1 for nul */
 479
 480                 break;
 481               }
 482             case EILSEQ:
 483               if (save_p)
 484                 {
 485                   /* Error converting fallback string - fatal
 486                    */
 487                   g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 488                                _("Cannot convert fallback '%s' to codeset '%s'"),
 489                                insert_str, to_codeset);
 490                   have_error = TRUE;
 491                   break;
 492                 }
 493               else
 494                 {
 495                   if (!fallback)
 496                     {
 497                       gunichar ch = g_utf8_get_char (p);
 498                       insert_str = g_strdup_printf ("\\x{%0*X}",
 499                                                     (ch < 0x10000) ? 4 : 6,
 500                                                     ch);
 501                     }
 502                   else
 503                     insert_str = fallback;
 504
 505                   save_p = g_utf8_next_char (p);
 506                   save_inbytes = inbytes_remaining - (save_p - p);
 507                   p = insert_str;
 508                   inbytes_remaining = strlen (p);
 509                 }
 510               break;
 511             default:
 512               g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
 513                            _("Error during conversion: %s"),
 514                            strerror (errno));
 515               have_error = TRUE;
 516               break;
 517             }
 518         }
 519       else
 520         {
 521           if (save_p)
 522             {
 523               if (!fallback)
 524                 g_free ((gchar *)insert_str);
 525               p = save_p;
 526               inbytes_remaining = save_inbytes;
 527               save_p = NULL;
 528             }
 529           else
 530             done = TRUE;
 531         }
 532     }
 533
 534   /* Cleanup
 535    */
 536   *outp = '\0';
 537
 538   g_iconv_close (cd);
 539
 540   if (bytes_written)
 541     *bytes_written = outp - str;        /* Doesn't include '\0' */
 542
 543   g_free (utf8);
 544
 545   if (have_error)
 546     {
 547       if (save_p && !fallback)
 548         g_free ((gchar *)insert_str);
 549       g_free (dest);
 550       return NULL;
 551     }
 552   else
 553     return dest;
 554 }
 555
 556 /*
 557  * g_locale_to_utf8
 558  *
 559  *
 560  */
 561
 562 static gchar *
 563 strdup_len (const gchar *string,
 564             gssize       len,
 565             gsize       *bytes_written,
 566             gsize       *bytes_read,
 567             GError      **error)
 568
 569 {
 570   gsize real_len;
 571
 572   if (!g_utf8_validate (string, -1, NULL))
 573     {
 574       if (bytes_read)
 575         *bytes_read = 0;
 576       if (bytes_written)
 577         *bytes_written = 0;
 578
 579       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 580                    _("Invalid byte sequence in conversion input"));
 581       return NULL;
 582     }
 583
 584   if (len < 0)
 585     real_len = strlen (string);
 586   else
 587     {
 588       real_len = 0;
 589
 590       while (real_len < len && string[real_len])
 591         real_len++;
 592     }
 593
 594   if (bytes_read)
 595     *bytes_read = real_len;
 596   if (bytes_written)
 597     *bytes_written = real_len;
 598
 599   return g_strndup (string, real_len);
 600 }
 601
 602 /**
 603  * g_locale_to_utf8:
 604  * @opsysstring:   a string in the encoding of the current locale
 605  * @len:           the length of the string, or -1 if the string is
 606  *                 NULL-terminated.
 607  * @bytes_read:    location to store the number of bytes in the
 608  *                 input string that were successfully converted, or %NULL.
 609  *                 Even if the conversion was succesful, this may be
 610  *                 less than len if there were partial characters
 611  *                 at the end of the input. If the error
 612  *                 G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
 613  *                 stored will the byte fofset after the last valid
 614  *                 input sequence.
 615  * @bytes_written: the stored in the output buffer (not including the
 616  *                 terminating nul.
 617  * @error: location to store the error occuring, or %NULL to ignore
 618  *                 errors. Any of the errors in #GConvertError may occur.
 619  *
 620  * Converts a string which is in the encoding used for strings by
 621  * the C runtime (usually the same as that used by the operating
 622  * system) in the current locale into a UTF-8 string.
 623  *
 624  * Return value: The converted string, or %NULL on an error.
 625  **/
 626 gchar *
 627 g_locale_to_utf8 (const gchar  *opsysstring,
 628                   gssize        len,
 629                   gsize        *bytes_read,
 630                   gsize        *bytes_written,
 631                   GError      **error)
 632 {
 633 #ifdef G_PLATFORM_WIN32
 634
 635   gint i, clen, total_len, wclen, first;
 636   wchar_t *wcs, wc;
 637   gchar *result, *bp;
 638   const wchar_t *wcp;
 639
 640   if (len == -1)
 641     len = strlen (opsysstring);
 642
 643   wcs = g_new (wchar_t, len);
 644   wclen = MultiByteToWideChar (CP_ACP, 0, opsysstring, len, wcs, len);
 645
 646   wcp = wcs;
 647   total_len = 0;
 648   for (i = 0; i < wclen; i++)
 649     {
 650       wc = *wcp++;
 651
 652       if (wc < 0x80)
 653         total_len += 1;
 654       else if (wc < 0x800)
 655         total_len += 2;
 656       else if (wc < 0x10000)
 657         total_len += 3;
 658       else if (wc < 0x200000)
 659         total_len += 4;
 660       else if (wc < 0x4000000)
 661         total_len += 5;
 662       else
 663         total_len += 6;
 664     }
 665
 666   result = g_malloc (total_len + 1);
 667
 668   wcp = wcs;
 669   bp = result;
 670   for (i = 0; i < wclen; i++)
 671     {
 672       wc = *wcp++;
 673
 674       if (wc < 0x80)
 675         {
 676           first = 0;
 677           clen = 1;
 678         }
 679       else if (wc < 0x800)
 680         {
 681           first = 0xc0;
 682           clen = 2;
 683         }
 684       else if (wc < 0x10000)
 685         {
 686           first = 0xe0;
 687           clen = 3;
 688         }
 689       else if (wc < 0x200000)
 690         {
 691           first = 0xf0;
 692           clen = 4;
 693         }
 694       else if (wc < 0x4000000)
 695         {
 696           first = 0xf8;
 697           clen = 5;
 698         }
 699       else
 700         {
 701           first = 0xfc;
 702           clen = 6;
 703         }
 704
 705       /* Woo-hoo! */
 706       switch (clen)
 707         {
 708         case 6: bp[5] = (wc & 0x3f) | 0x80; wc >>= 6; /* Fall through */
 709         case 5: bp[4] = (wc & 0x3f) | 0x80; wc >>= 6; /* Fall through */
 710         case 4: bp[3] = (wc & 0x3f) | 0x80; wc >>= 6; /* Fall through */
 711         case 3: bp[2] = (wc & 0x3f) | 0x80; wc >>= 6; /* Fall through */
 712         case 2: bp[1] = (wc & 0x3f) | 0x80; wc >>= 6; /* Fall through */
 713         case 1: bp[0] = wc | first;
 714         }
 715
 716       bp += clen;
 717     }
 718   *bp = 0;
 719
 720   g_free (wcs);
 721
 722   if (bytes_read)
 723     *bytes_read = len;
 724   if (bytes_written)
 725     *bytes_written = total_len;
 726
 727   return result;
 728
 729 #else  /* !G_PLATFORM_WIN32 */
 730
 731   const char *charset;
 732
 733   if (g_get_charset (&charset))
 734     return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
 735   else
 736     return g_convert (opsysstring, len,
 737                       "UTF-8", charset, bytes_read, bytes_written, error);
 738
 739 #endif /* !G_PLATFORM_WIN32 */
 740 }
 741
 742 /**
 743  * g_locale_from_utf8:
 744  * @utf8string:    a UTF-8 encoded string
 745  * @len:           the length of the string, or -1 if the string is
 746  *                 NULL-terminated.
 747  * @bytes_read:    location to store the number of bytes in the
 748  *                 input string that were successfully converted, or %NULL.
 749  *                 Even if the conversion was succesful, this may be
 750  *                 less than len if there were partial characters
 751  *                 at the end of the input. If the error
 752  *                 G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
 753  *                 stored will the byte fofset after the last valid
 754  *                 input sequence.
 755  * @bytes_written: the stored in the output buffer (not including the
 756  *                 terminating nul.
 757  * @error: location to store the error occuring, or %NULL to ignore
 758  *                 errors. Any of the errors in #GConvertError may occur.
 759  *
 760  * Converts a string from UTF-8 to the encoding used for strings by
 761  * the C runtime (usually the same as that used by the operating
 762  * system) in the current locale.
 763  *
 764  * Return value: The converted string, or %NULL on an error.
 765  **/
 766 gchar *
 767 g_locale_from_utf8 (const gchar *utf8string,
 768                     gssize       len,
 769                     gsize       *bytes_read,
 770                     gsize       *bytes_written,
 771                     GError     **error)
 772 {
 773 #ifdef G_PLATFORM_WIN32
 774
 775   gint i, mask, clen, mblen;
 776   wchar_t *wcs, *wcp;
 777   gchar *result;
 778   guchar *cp, *end, c;
 779   gint n;
 780
 781   if (len == -1)
 782     len = strlen (utf8string);
 783
 784   /* First convert to wide chars */
 785   cp = (guchar *) utf8string;
 786   end = cp + len;
 787   n = 0;
 788   wcs = g_new (wchar_t, len + 1);
 789   wcp = wcs;
 790   while (cp != end)
 791     {
 792       mask = 0;
 793       c = *cp;
 794
 795       if (c < 0x80)
 796         {
 797           clen = 1;
 798           mask = 0x7f;
 799         }
 800       else if ((c & 0xe0) == 0xc0)
 801         {
 802           clen = 2;
 803           mask = 0x1f;
 804         }
 805       else if ((c & 0xf0) == 0xe0)
 806         {
 807           clen = 3;
 808           mask = 0x0f;
 809         }
 810       else if ((c & 0xf8) == 0xf0)
 811         {
 812           clen = 4;
 813           mask = 0x07;
 814         }
 815       else if ((c & 0xfc) == 0xf8)
 816         {
 817           clen = 5;
 818           mask = 0x03;
 819         }
 820       else if ((c & 0xfc) == 0xfc)
 821         {
 822           clen = 6;
 823           mask = 0x01;
 824         }
 825       else
 826         {
 827           g_free (wcs);
 828           return NULL;
 829         }
 830
 831       if (cp + clen > end)
 832         {
 833           g_free (wcs);
 834           return NULL;
 835         }
 836
 837       *wcp = (cp[0] & mask);
 838       for (i = 1; i < clen; i++)
 839         {
 840           if ((cp[i] & 0xc0) != 0x80)
 841             {
 842               g_free (wcs);
 843               return NULL;
 844             }
 845           *wcp <<= 6;
 846           *wcp |= (cp[i] & 0x3f);
 847         }
 848
 849       cp += clen;
 850       wcp++;
 851       n++;
 852     }
 853   if (cp != end)
 854     {
 855       g_free (wcs);
 856       return NULL;
 857     }
 858
 859   /* n is the number of wide chars constructed */
 860
 861   /* Convert to a string in the current ANSI codepage */
 862
 863   result = g_new (gchar, 3 * n + 1);
 864   mblen = WideCharToMultiByte (CP_ACP, 0, wcs, n, result, 3*n, NULL, NULL);
 865   result[mblen] = 0;
 866   g_free (wcs);
 867
 868   if (bytes_read)
 869     *bytes_read = len;
 870   if (bytes_written)
 871     *bytes_written = mblen;
 872
 873   return result;
 874
 875 #else  /* !G_PLATFORM_WIN32 */
 876
 877   const gchar *charset;
 878
 879   if (g_get_charset (&charset))
 880     return strdup_len (utf8string, len, bytes_read, bytes_written, error);
 881   else
 882     return g_convert (utf8string, len,
 883                       charset, "UTF-8", bytes_read, bytes_written, error);
 884
 885 #endif /* !G_PLATFORM_WIN32 */
 886 }
 887
 888 /**
 889  * g_filename_to_utf8:
 890  * @opsysstring:   a string in the encoding for filenames
 891  * @len:           the length of the string, or -1 if the string is
 892  *                 NULL-terminated.
 893  * @bytes_read:    location to store the number of bytes in the
 894  *                 input string that were successfully converted, or %NULL.
 895  *                 Even if the conversion was succesful, this may be
 896  *                 less than len if there were partial characters
 897  *                 at the end of the input. If the error
 898  *                 G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
 899  *                 stored will the byte fofset after the last valid
 900  *                 input sequence.
 901  * @bytes_written: the stored in the output buffer (not including the
 902  *                 terminating nul.
 903  * @error: location to store the error occuring, or %NULL to ignore
 904  *                 errors. Any of the errors in #GConvertError may occur.
 905  *
 906  * Converts a string which is in the encoding used for filenames
 907  * into a UTF-8 string.
 908  *
 909  * Return value: The converted string, or %NULL on an error.
 910  **/
 911 gchar*
 912 g_filename_to_utf8 (const gchar *opsysstring,
 913                     gssize       len,
 914                     gsize       *bytes_read,
 915                     gsize       *bytes_written,
 916                     GError     **error)
 917 {
 918 #ifdef G_PLATFORM_WIN32
 919   return g_locale_to_utf8 (opsysstring, len,
 920                            bytes_read, bytes_written,
 921                            error);
 922 #else  /* !G_PLATFORM_WIN32 */
 923
 924   if (getenv ("G_BROKEN_FILENAMES"))
 925     return g_locale_to_utf8 (opsysstring, len,
 926                              bytes_read, bytes_written,
 927                              error);
 928   else
 929     return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
 930 #endif /* !G_PLATFORM_WIN32 */
 931 }
 932
 933 /**
 934  * g_filename_from_utf8:
 935  * @utf8string:    a UTF-8 encoded string
 936  * @len:           the length of the string, or -1 if the string is
 937  *                 NULL-terminated.
 938  * @bytes_read:    location to store the number of bytes in the
 939  *                 input string that were successfully converted, or %NULL.
 940  *                 Even if the conversion was succesful, this may be
 941  *                 less than len if there were partial characters
 942  *                 at the end of the input. If the error
 943  *                 G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
 944  *                 stored will the byte fofset after the last valid
 945  *                 input sequence.
 946  * @bytes_written: the stored in the output buffer (not including the
 947  *                 terminating nul.
 948  * @error: location to store the error occuring, or %NULL to ignore
 949  *                 errors. Any of the errors in #GConvertError may occur.
 950  *
 951  * Converts a string from UTF-8 to the encoding used for filenames.
 952  *
 953  * Return value: The converted string, or %NULL on an error.
 954  **/
 955 gchar*
 956 g_filename_from_utf8 (const gchar *utf8string,
 957                       gssize       len,
 958                       gsize       *bytes_read,
 959                       gsize       *bytes_written,
 960                       GError     **error)
 961 {
 962 #ifdef G_PLATFORM_WIN32
 963   return g_locale_from_utf8 (utf8string, len,
 964                              bytes_read, bytes_written,
 965                              error);
 966 #else  /* !G_PLATFORM_WIN32 */
 967   if (getenv ("G_BROKEN_FILENAMES"))
 968     return g_locale_from_utf8 (utf8string, len,
 969                                bytes_read, bytes_written,
 970                                error);
 971   else
 972     return strdup_len (utf8string, len, bytes_read, bytes_written, error);
 973 #endif /* !G_PLATFORM_WIN32 */
 974 }
 975
 976 /* Test of haystack has the needle prefix, comparing case
 977  * insensitive. haystack may be UTF-8, but needle must
 978  * contain only ascii. */
 979 static gboolean
 980 has_case_prefix (const gchar *haystack, const gchar *needle)
 981 {
 982   const gchar *h, *n;
 983
 984   /* Eat one character at a time. */
 985   h = haystack;
 986   n = needle;
 987
 988   while (*n && *h &&
 989          g_ascii_tolower (*n) == g_ascii_tolower (*h))
 990     {
 991       n++;
 992       h++;
 993     }
 994
 995   return *n == '\0';
 996 }
 997
 998 typedef enum {
 999   UNSAFE_ALL        = 0x1,  /* Escape all unsafe characters   */
1000   UNSAFE_ALLOW_PLUS = 0x2,  /* Allows '+'  */
1001   UNSAFE_PATH       = 0x4,  /* Allows '/' and '?' and '&' and '='  */
1002   UNSAFE_DOS_PATH   = 0x8,  /* Allows '/' and '?' and '&' and '=' and ':' */
1003   UNSAFE_HOST       = 0x10, /* Allows '/' and ':' and '@' */
1004   UNSAFE_SLASHES    = 0x20  /* Allows all characters except for '/' and '%' */
1005 } UnsafeCharacterSet;
1006
/* Escaping table for the ASCII characters 0x20..0x7F, indexed by
 * (character - 32).  Each entry is a bitmask of UnsafeCharacterSet
 * values: a character may be emitted unescaped under a given set only
 * if that set's bit is present in its entry (see the ACCEPTABLE() macro
 * in g_escape_uri_string()).  An entry of 0x00 means the character is
 * escaped under every set. */
static const guchar acceptable[96] = {
 /* X0   X1   X2   X3   X4   X5   X6   X7   X8   X9   XA   XB   XC   XD   XE   XF */
  0x00,0x3F,0x20,0x20,0x20,0x00,0x2C,0x3F,0x3F,0x3F,0x3F,0x22,0x20,0x3F,0x3F,0x1C, /* 2X  !"#$%&'()*+,-./   */
  0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x38,0x20,0x20,0x2C,0x20,0x2C, /* 3X 0123456789:;<=>?   */
  0x30,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F, /* 4X @ABCDEFGHIJKLMNO   */
  0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x20,0x20,0x20,0x20,0x3F, /* 5X PQRSTUVWXYZ[\]^_   */
  0x20,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F, /* 6X `abcdefghijklmno   */
  0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x20,0x20,0x20,0x3F,0x20  /* 7X pqrstuvwxyz{|}~DEL */
};
1016
/* Upper-case hexadecimal digits, indexed by nibble value.  The array is
 * exactly 16 bytes long, so it holds no terminating nul and must only be
 * indexed, never treated as a C string. */
static const gchar hex[16] = "0123456789ABCDEF";
1018
1019 /* Note: This escape function works on file: URIs, but if you want to
1020  * escape something else, please read RFC-2396 */
1021 static gchar *
1022 g_escape_uri_string (const gchar *string,
1023                      UnsafeCharacterSet mask)
1024 {
1025 #define ACCEPTABLE(a) ((a)>=32 && (a)<128 && (acceptable[(a)-32] & use_mask))
1026
1027   const gchar *p;
1028   gchar *q;
1029   gchar *result;
1030   int c;
1031   gint unacceptable;
1032   UnsafeCharacterSet use_mask;
1033
1034   g_return_val_if_fail (mask == UNSAFE_ALL
1035                         || mask == UNSAFE_ALLOW_PLUS
1036                         || mask == UNSAFE_PATH
1037                         || mask == UNSAFE_DOS_PATH
1038                         || mask == UNSAFE_HOST
1039                         || mask == UNSAFE_SLASHES, NULL);
1040
1041   unacceptable = 0;
1042   use_mask = mask;
1043   for (p = string; *p != '\0'; p++)
1044     {
1045       c = *p;
1046       if (!ACCEPTABLE (c))
1047         unacceptable++;
1048     }
1049
1050   result = g_malloc (p - string + unacceptable * 2 + 1);
1051
1052   use_mask = mask;
1053   for (q = result, p = string; *p != '\0'; p++)
1054     {
1055       c = (unsigned char)*p;
1056
1057       if (!ACCEPTABLE (c))
1058         {
1059           *q++ = '%'; /* means hex coming */
1060           *q++ = hex[c >> 4];
1061           *q++ = hex[c & 15];
1062         }
1063       else
1064         *q++ = *p;
1065     }
1066
1067   *q = '\0';
1068
1069   return result;
1070 }
1071
1072
1073 static gchar *
1074 g_escape_file_uri (const gchar *hostname,
1075                    const gchar *pathname)
1076 {
1077   char *escaped_hostname = NULL;
1078   char *escaped_path;
1079   char *res;
1080
1081   if (hostname && *hostname != '\0')
1082     {
1083       escaped_hostname = g_escape_uri_string (hostname, UNSAFE_HOST);
1084     }
1085
1086   escaped_path = g_escape_uri_string (pathname, UNSAFE_DOS_PATH);
1087
1088   res = g_strconcat ("file://",
1089                      (escaped_hostname) ? escaped_hostname : "",
1090                      (*escaped_path != '/') ? "/" : "",
1091                      escaped_path,
1092                      NULL);
1093
1094   g_free (escaped_hostname);
1095   g_free (escaped_path);
1096
1097   return res;
1098 }
1099
/* Combines the two hexadecimal digits at scanner[0] and scanner[1] into
 * the byte value they spell.  Returns -1 if either character is rejected
 * by g_ascii_xdigit_value(); the second character is not read when the
 * first one is already invalid. */
static int
unescape_character (const char *scanner)
{
  int high;
  int low;

  high = g_ascii_xdigit_value (scanner[0]);
  if (high < 0)
    return -1;

  low = g_ascii_xdigit_value (scanner[1]);
  if (low < 0)
    return -1;

  return (high << 4) | low;
}
1117
1118 static gchar *
1119 g_unescape_uri_string (const gchar *escaped,
1120                        const gchar *illegal_characters,
1121                        int          len)
1122 {
1123   const gchar *in, *in_end;
1124   gchar *out, *result;
1125   int character;
1126
1127   if (escaped == NULL)
1128     return NULL;
1129
1130   if (len < 0)
1131     len = strlen (escaped);
1132
1133     result = g_malloc (len + 1);
1134
1135   out = result;
1136   for (in = escaped, in_end = escaped + len; in < in_end && *in != '\0'; in++)
1137     {
1138       character = *in;
1139       if (character == '%')
1140         {
1141           character = unescape_character (in + 1);
1142
1143           /* Check for an illegal character. We consider '\0' illegal here. */
1144           if (character == 0
1145               || (illegal_characters != NULL
1146                   && strchr (illegal_characters, (char)character) != NULL))
1147             {
1148               g_free (result);
1149               return NULL;
1150             }
1151           in += 2;
1152         }
1153       *out++ = character;
1154     }
1155
1156   *out = '\0';
1157
1158   g_assert (out - result <= strlen (escaped));
1159
1160   if (!g_utf8_validate (result, -1, NULL))
1161     {
1162       g_free (result);
1163       return NULL;
1164     }
1165
1166   return result;
1167 }
1168
1169 /**
1170  * g_filename_from_uri:
1171  * @uri: a uri describing a filename (escaped, encoded in UTF-8)
1172  * @hostname: Location to store hostname for the URI, or %NULL.
1173  *            If there is no hostname in the URI, %NULL will be
1174  *            stored in this location.
1175  * @error: location to store the error occuring, or %NULL to ignore
1176  *         errors. Any of the errors in #GConvertError may occur.
1177  *
1178  * Converts an escaped UTF-8 encoded URI to a local filename in the
1179  * encoding used for filenames.
1180  *
1181  * Return value: a newly allocated string holding the resulting
1182  *               filename, or %NULL on an error.
1183  **/
1184 gchar *
1185 g_filename_from_uri (const char *uri,
1186                      char      **hostname,
1187                      GError    **error)
1188 {
1189   const char *path_part;
1190   const char *host_part;
1191   char *unescaped_hostname;
1192   char *result;
1193   char *filename;
1194   int offs;
1195
1196   if (hostname)
1197     *hostname = NULL;
1198
1199   if (!has_case_prefix (uri, "file:/"))
1200     {
1201       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NOT_ABSOLUTE_FILE_URI,
1202                    _("The URI `%s' is not an absolute URI using the file scheme"),
1203                    uri);
1204       return NULL;
1205     }
1206
1207   path_part = uri + strlen ("file:");
1208
1209   if (strchr (path_part, '#') != NULL)
1210     {
1211       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_INVALID_URI,
1212                    _("The local file URI `%s' may not include a `#'"),
1213                    uri);
1214       return NULL;
1215     }
1216
1217   if (has_case_prefix (path_part, "///"))
1218     path_part += 2;
1219   else if (has_case_prefix (path_part, "//"))
1220     {
1221       path_part += 2;
1222       host_part = path_part;
1223
1224       path_part = strchr (path_part, '/');
1225
1226       if (path_part == NULL)
1227         {
1228           g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_INVALID_URI,
1229                        _("The URI `%s' is invalid"),
1230                        uri);
1231           return NULL;
1232         }
1233
1234       unescaped_hostname = g_unescape_uri_string (host_part, "", path_part - host_part);
1235       if (unescaped_hostname == NULL)
1236         {
1237           g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_INVALID_URI,
1238                        _("The hostname of the URI `%s' contains invalidly escaped characters"),
1239                        uri);
1240           return NULL;
1241         }
1242
1243       if (hostname)
1244         *hostname = unescaped_hostname;
1245       else
1246         g_free (unescaped_hostname);
1247     }
1248
1249   filename = g_unescape_uri_string (path_part, "/", -1);
1250
1251   if (filename == NULL)
1252     {
1253       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_INVALID_URI,
1254                    _("The URI `%s' contains invalidly escaped characters"),
1255                    uri);
1256       return NULL;
1257     }
1258
1259   /* DOS uri's are like "file://host/c:\foo", so we need to check if we need to
1260    * drop the initial slash */
1261   offs = 0;
1262   if (g_path_is_absolute (filename+1))
1263     offs = 1;
1264
1265   result = g_filename_from_utf8 (filename + offs, -1, NULL, NULL, error);
1266   g_free (filename);
1267
1268   return result;
1269 }
1270
1271 /**
1272  * g_filename_to_uri:
1273  * @filename: an absolute filename specified in the encoding
1274  *            used for filenames by the operating system.
1275  * @hostname: A UTF-8 encoded hostname, or %NULL for none.
1276  * @error: location to store the error occuring, or %NULL to ignore
1277  *         errors. Any of the errors in #GConvertError may occur.
1278  *
1279  * Converts an absolute filename to an escaped UTF-8 encoded URI.
1280  *
1281  * Return value: a newly allocated string holding the resulting
1282  *               URI, or %NULL on an error.
1283  **/
1284 gchar *
1285 g_filename_to_uri   (const char *filename,
1286                      char       *hostname,
1287                      GError    **error)
1288 {
1289   char *escaped_uri;
1290   char *utf8_filename;
1291
1292   g_return_val_if_fail (filename != NULL, NULL);
1293
1294   if (!g_path_is_absolute (filename))
1295     {
1296       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NOT_ABSOLUTE_PATH,
1297                    _("The pathname '%s' is not an absolute path"),
1298                    filename);
1299       return NULL;
1300     }
1301
1302   utf8_filename = g_filename_to_utf8 (filename, -1, NULL, NULL, error);
1303   if (utf8_filename == NULL)
1304     return NULL;
1305
1306   if (hostname &&
1307       !g_utf8_validate (hostname, -1, NULL))
1308     {
1309       g_free (utf8_filename);
1310       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1311                    _("Invalid byte sequence in hostname"));
1312       return NULL;
1313     }
1314
1315   escaped_uri = g_escape_file_uri (hostname,
1316                                    utf8_filename);
1317   g_free (utf8_filename);
1318
1319   return escaped_uri;
1320 }
1321