gconvert.c

   1 /* GLIB - Library of useful routines for C programming
   2  *
   3  * gconvert.c: Convert between character sets using iconv
   4  * Copyright Red Hat Inc., 2000
 * Authors: Havoc Pennington <hp@redhat.com>, Owen Taylor <otaylor@redhat.com>
   6  *
   7  * This library is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2 of the License, or (at your option) any later version.
  11  *
  12  * This library is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with this library; if not, write to the
  19  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20  * Boston, MA 02111-1307, USA.
  21  */
  22
  23 #include <iconv.h>
  24 #include <errno.h>
  25 #include <string.h>
  26 #include <stdlib.h>
  27
  28 #include "glib.h"
  29 #include "config.h"
  30
  31 #ifdef G_OS_WIN32
  32 #include <windows.h>
  33 #endif
  34
  35 #define _(s) (s)
  36
  37 GQuark
  38 g_convert_error_quark()
  39 {
  40   static GQuark quark;
  41   if (!quark)
  42     quark = g_quark_from_static_string ("g_convert_error");
  43
  44   return quark;
  45 }
  46
  47 #if defined(USE_LIBICONV) && !defined (_LIBICONV_H)
  48 #error libiconv in use but included iconv.h not from libiconv
  49 #endif
  50 #if !defined(USE_LIBICONV) && defined (_LIBICONV_H)
  51 #error libiconv not in use but included iconv.h is from libiconv
  52 #endif
  53
  54 GIConv
  55 g_iconv_open (const gchar  *to_codeset,
  56               const gchar  *from_codeset)
  57 {
  58   iconv_t cd = iconv_open (to_codeset, from_codeset);
  59
  60   return (GIConv)cd;
  61 }
  62
  63 size_t
  64 g_iconv (GIConv   converter,
  65          gchar  **inbuf,
  66          size_t  *inbytes_left,
  67          gchar  **outbuf,
  68          size_t  *outbytes_left)
  69 {
  70   iconv_t cd = (iconv_t)converter;
  71
  72   return iconv (cd, inbuf, inbytes_left, outbuf, outbytes_left);
  73 }
  74
  75 gint
  76 g_iconv_close (GIConv converter)
  77 {
  78   iconv_t cd = (iconv_t)converter;
  79
  80   return iconv_close (cd);
  81 }
  82
  83 GIConv
  84 open_converter (const gchar *to_codeset,
  85                 const gchar *from_codeset,
  86                 GError     **error)
  87 {
  88   GIConv cd = g_iconv_open (to_codeset, from_codeset);
  89
  90   if (cd == (iconv_t) -1)
  91     {
  92       /* Something went wrong.  */
  93       if (errno == EINVAL)
  94         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_CONVERSION,
  95                      _("Conversion from character set `%s' to `%s' is not suppo\rted"),
  96                      from_codeset, to_codeset);
  97       else
  98         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
  99                      _("Could not open converter from `%s' to `%s': %s"),
 100                      from_codeset, to_codeset, strerror (errno));
 101     }
 102
 103   return cd;
 104
 105 }
 106
 107 /**
 108  * g_convert:
 109  * @str:           the string to convert
 110  * @len:           the length of the string
 111  * @to_codeset:    name of character set into which to convert @str
 112  * @from_codeset:  character set of @str.
 113  * @bytes_read:    location to store the number of bytes in the
 114  *                 input string that were successfully converted, or %NULL.
 115  *                 Even if the conversion was succesful, this may be
 116  *                 less than len if there were partial characters
 117  *                 at the end of the input. If the error
 118  *                 G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
 119  *                 stored will the byte fofset after the last valid
 120  *                 input sequence.
 121  * @bytes_written: the stored in the output buffer (not including the
 122  *                 terminating nul.
 123  * @error:         location to store the error occuring, or %NULL to ignore
 124  *                 errors. Any of the errors in #GConvertError may occur.
 125  *
 126  * Convert a string from one character set to another.
 127  *
 128  * Return value: If the conversion was successful, a newly allocated
 129  *               NUL-terminated string, which must be freed with
 130  *               g_free. Otherwise %NULL and @error will be set.
 131  **/
 132 gchar*
 133 g_convert (const gchar *str,
 134            gint         len,
 135            const gchar *to_codeset,
 136            const gchar *from_codeset,
 137            gint        *bytes_read,
 138            gint        *bytes_written,
 139            GError     **error)
 140 {
 141   gchar *dest;
 142   gchar *outp;
 143   const gchar *p;
 144   size_t inbytes_remaining;
 145   size_t outbytes_remaining;
 146   size_t err;
 147   GIConv cd;
 148   size_t outbuf_size;
 149   gboolean have_error = FALSE;
 150
 151   g_return_val_if_fail (str != NULL, NULL);
 152   g_return_val_if_fail (to_codeset != NULL, NULL);
 153   g_return_val_if_fail (from_codeset != NULL, NULL);
 154
 155   cd = open_converter (to_codeset, from_codeset, error);
 156
 157   if (cd == (GIConv) -1)
 158     {
 159       if (bytes_read)
 160         *bytes_read = 0;
 161
 162       if (bytes_written)
 163         *bytes_written = 0;
 164
 165       return NULL;
 166     }
 167
 168   if (len < 0)
 169     len = strlen (str);
 170
 171   p = str;
 172   inbytes_remaining = len;
 173   outbuf_size = len + 1; /* + 1 for nul in case len == 1 */
 174   outbytes_remaining = outbuf_size - 1; /* -1 for nul */
 175   outp = dest = g_malloc (outbuf_size);
 176
 177  again:
 178
 179   err = g_iconv (cd, (char **)&p, &inbytes_remaining, &outp, &outbytes_remaining);
 180
 181   if (err == (size_t) -1)
 182     {
 183       switch (errno)
 184         {
 185         case EINVAL:
 186           /* Incomplete text, do not report an error */
 187           break;
 188         case E2BIG:
 189           {
 190             size_t used = outp - dest;
 191             outbuf_size *= 2;
 192             dest = g_realloc (dest, outbuf_size);
 193
 194             outp = dest + used;
 195             outbytes_remaining = outbuf_size - used - 1; /* -1 for nul */
 196
 197             goto again;
 198           }
 199         case EILSEQ:
 200           g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 201                        _("Invalid byte sequence in conversion input"));
 202           have_error = TRUE;
 203           break;
 204         default:
 205           g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
 206                        _("Error during conversion: %s"),
 207                        strerror (errno));
 208           have_error = TRUE;
 209           break;
 210         }
 211     }
 212
 213   *outp = '\0';
 214
 215   g_iconv_close (cd);
 216
 217   if (bytes_read)
 218     *bytes_read = p - str;
 219   else
 220     {
 221       if ((p - str) != len)
 222         {
 223           g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
 224                        _("Partial character sequence at end of input"));
 225           have_error = TRUE;
 226         }
 227     }
 228
 229   if (bytes_written)
 230     *bytes_written = outp - dest;       /* Doesn't include '\0' */
 231
 232   if (have_error)
 233     {
 234       g_free (dest);
 235       return NULL;
 236     }
 237   else
 238     return dest;
 239 }
 240
 241 /**
 242  * g_convert_with_fallback:
 243  * @str:          the string to convert
 244  * @len:          the length of the string
 245  * @to_codeset:   name of character set into which to convert @str
 246  * @from_codeset: character set of @str.
 247  * @fallback:     UTF-8 string to use in place of character not
 248  *                present in the target encoding. (This must be
 249  *                in the target encoding), if %NULL, characters
 250  *                not in the target encoding will be represented
 251  *                as Unicode escapes \x{XXXX} or \x{XXXXXX}.
 252  * @bytes_read:   location to store the number of bytes in the
 253  *                input string that were successfully converted, or %NULL.
 254  *                Even if the conversion was succesful, this may be
 255  *                less than len if there were partial characters
 256  *                at the end of the input. If the error
 257  *                G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
 258  *                stored will the byte fofset after the last valid
 259  *                input sequence.
 260  * @bytes_written: the stored in the output buffer (not including the
 261  *                 terminating nul.
 262  * @error:        location to store the error occuring, or %NULL to ignore
 263  *                errors. Any of the errors in #GConvertError may occur.
 264  *
 265  * Convert a string from one character set to another, possibly
 266  * including fallback sequences for characters not representable
 267  * in the output. Note that it is not guaranteed that the specification
 268  * for the fallback sequences in @fallback will be honored. Some
 269  * systems may do a approximate conversion from @from_codeset
 270  * to @to_codeset in their iconv() functions, in which case GLib
 271  * will simply return that approximate conversion.
 272  *
 273  * Return value: If the conversion was successful, a newly allocated
 274  *               NUL-terminated string, which must be freed with
 275  *               g_free. Otherwise %NULL and @error will be set.
 276  **/
 277 gchar*
 278 g_convert_with_fallback (const gchar *str,
 279                          gint         len,
 280                          const gchar *to_codeset,
 281                          const gchar *from_codeset,
 282                          gchar       *fallback,
 283                          gint        *bytes_read,
 284                          gint        *bytes_written,
 285                          GError     **error)
 286 {
 287   gchar *utf8;
 288   gchar *dest;
 289   gchar *outp;
 290   const gchar *insert_str = NULL;
 291   const gchar *p;
 292   int inbytes_remaining;
 293   const gchar *save_p = NULL;
 294   size_t save_inbytes = 0;
 295   size_t outbytes_remaining;
 296   size_t err;
 297   GIConv cd;
 298   size_t outbuf_size;
 299   gboolean have_error = FALSE;
 300   gboolean done = FALSE;
 301
 302   GError *local_error = NULL;
 303
 304   g_return_val_if_fail (str != NULL, NULL);
 305   g_return_val_if_fail (to_codeset != NULL, NULL);
 306   g_return_val_if_fail (from_codeset != NULL, NULL);
 307
 308   if (len < 0)
 309     len = strlen (str);
 310
 311   /* Try an exact conversion; we only proceed if this fails
 312    * due to an illegal sequence in the input string.
 313    */
 314   dest = g_convert (str, len, to_codeset, from_codeset,
 315                     bytes_read, bytes_written, &local_error);
 316   if (!local_error)
 317     return dest;
 318
 319   if (!g_error_matches (local_error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
 320     {
 321       g_propagate_error (error, local_error);
 322       return NULL;
 323     }
 324   else
 325     g_error_free (local_error);
 326
 327   /* No go; to proceed, we need a converter from "UTF-8" to
 328    * to_codeset, and the string as UTF-8.
 329    */
 330   cd = open_converter (to_codeset, "UTF-8", error);
 331   if (cd == (GIConv) -1)
 332     {
 333       if (bytes_read)
 334         *bytes_read = 0;
 335
 336       if (bytes_written)
 337         *bytes_written = 0;
 338
 339       return NULL;
 340     }
 341
 342   utf8 = g_convert (str, len, "UTF-8", from_codeset,
 343                     bytes_read, &inbytes_remaining, error);
 344   if (!utf8)
 345     return NULL;
 346
 347   /* Now the heart of the code. We loop through the UTF-8 string, and
 348    * whenever we hit an offending character, we form fallback, convert
 349    * the fallback to the target codeset, and then go back to
 350    * converting the original string after finishing with the fallback.
 351    *
 352    * The variables save_p and save_inbytes store the input state
 353    * for the original string while we are converting the fallback
 354    */
 355   p = utf8;
 356   outbuf_size = len + 1; /* + 1 for nul in case len == 1 */
 357   outbytes_remaining = outbuf_size - 1; /* -1 for nul */
 358   outp = dest = g_malloc (outbuf_size);
 359
 360   while (!done && !have_error)
 361     {
 362       size_t inbytes_tmp = inbytes_remaining;
 363       err = g_iconv (cd, (char **)&p, &inbytes_tmp, &outp, &outbytes_remaining);
 364       inbytes_remaining = inbytes_tmp;
 365
 366       if (err == (size_t) -1)
 367         {
 368           switch (errno)
 369             {
 370             case EINVAL:
 371               g_assert_not_reached();
 372               break;
 373             case E2BIG:
 374               {
 375                 size_t used = outp - dest;
 376                 outbuf_size *= 2;
 377                 dest = g_realloc (dest, outbuf_size);
 378
 379                 outp = dest + used;
 380                 outbytes_remaining = outbuf_size - used - 1; /* -1 for nul */
 381
 382                 break;
 383               }
 384             case EILSEQ:
 385               if (save_p)
 386                 {
 387                   /* Error converting fallback string - fatal
 388                    */
 389                   g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 390                                _("Cannot convert fallback '%s' to codeset '%s'"),
 391                                insert_str, to_codeset);
 392                   have_error = TRUE;
 393                   break;
 394                 }
 395               else
 396                 {
 397                   if (!fallback)
 398                     {
 399                       gunichar ch = g_utf8_get_char (p);
 400                       insert_str = g_strdup_printf ("\\x{%0*X}",
 401                                                     (ch < 0x10000) ? 4 : 6,
 402                                                     ch);
 403                     }
 404                   else
 405                     insert_str = fallback;
 406
 407                   save_p = g_utf8_next_char (p);
 408                   save_inbytes = inbytes_remaining - (save_p - p);
 409                   p = insert_str;
 410                   inbytes_remaining = strlen (p);
 411                 }
 412               break;
 413             default:
 414               g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
 415                            _("Error during conversion: %s"),
 416                            strerror (errno));
 417               have_error = TRUE;
 418               break;
 419             }
 420         }
 421       else
 422         {
 423           if (save_p)
 424             {
 425               if (!fallback)
 426                 g_free ((gchar *)insert_str);
 427               p = save_p;
 428               inbytes_remaining = save_inbytes;
 429               save_p = NULL;
 430             }
 431           else
 432             done = TRUE;
 433         }
 434     }
 435
 436   /* Cleanup
 437    */
 438   *outp = '\0';
 439
 440   g_iconv_close (cd);
 441
 442   if (bytes_written)
 443     *bytes_written = outp - str;        /* Doesn't include '\0' */
 444
 445   g_free (utf8);
 446
 447   if (have_error)
 448     {
 449       if (save_p && !fallback)
 450         g_free ((gchar *)insert_str);
 451       g_free (dest);
 452       return NULL;
 453     }
 454   else
 455     return dest;
 456 }
 457
 458 /*
 459  * g_locale_to_utf8
 460  *
 461  * Converts a string which is in the encoding used for strings by
 462  * the C runtime (usually the same as that used by the operating
 463  * system) in the current locale into a UTF-8 string.
 464  */
 465
 466 gchar *
 467 g_locale_to_utf8 (const gchar *opsysstring, GError **error)
 468 {
 469 #ifdef G_OS_WIN32
 470
 471   gint i, clen, wclen, first;
 472   const gint len = strlen (opsysstring);
 473   wchar_t *wcs, wc;
 474   gchar *result, *bp;
 475   const wchar_t *wcp;
 476
 477   wcs = g_new (wchar_t, len);
 478   wclen = MultiByteToWideChar (CP_ACP, 0, opsysstring, len, wcs, len);
 479
 480   wcp = wcs;
 481   clen = 0;
 482   for (i = 0; i < wclen; i++)
 483     {
 484       wc = *wcp++;
 485
 486       if (wc < 0x80)
 487         clen += 1;
 488       else if (wc < 0x800)
 489         clen += 2;
 490       else if (wc < 0x10000)
 491         clen += 3;
 492       else if (wc < 0x200000)
 493         clen += 4;
 494       else if (wc < 0x4000000)
 495         clen += 5;
 496       else
 497         clen += 6;
 498     }
 499
 500   result = g_malloc (clen + 1);
 501
 502   wcp = wcs;
 503   bp = result;
 504   for (i = 0; i < wclen; i++)
 505     {
 506       wc = *wcp++;
 507
 508       if (wc < 0x80)
 509         {
 510           first = 0;
 511           clen = 1;
 512         }
 513       else if (wc < 0x800)
 514         {
 515           first = 0xc0;
 516           clen = 2;
 517         }
 518       else if (wc < 0x10000)
 519         {
 520           first = 0xe0;
 521           clen = 3;
 522         }
 523       else if (wc < 0x200000)
 524         {
 525           first = 0xf0;
 526           clen = 4;
 527         }
 528       else if (wc < 0x4000000)
 529         {
 530           first = 0xf8;
 531           clen = 5;
 532         }
 533       else
 534         {
 535           first = 0xfc;
 536           clen = 6;
 537         }
 538
 539       /* Woo-hoo! */
 540       switch (clen)
 541         {
 542         case 6: bp[5] = (wc & 0x3f) | 0x80; wc >>= 6; /* Fall through */
 543         case 5: bp[4] = (wc & 0x3f) | 0x80; wc >>= 6; /* Fall through */
 544         case 4: bp[3] = (wc & 0x3f) | 0x80; wc >>= 6; /* Fall through */
 545         case 3: bp[2] = (wc & 0x3f) | 0x80; wc >>= 6; /* Fall through */
 546         case 2: bp[1] = (wc & 0x3f) | 0x80; wc >>= 6; /* Fall through */
 547         case 1: bp[0] = wc | first;
 548         }
 549
 550       bp += clen;
 551     }
 552   *bp = 0;
 553
 554   g_free (wcs);
 555
 556   return result;
 557
 558 #else
 559
 560   char *charset, *str;
 561
 562   if (g_get_charset (&charset))
 563     return g_strdup (opsysstring);
 564
 565   str = g_convert (opsysstring, strlen (opsysstring),
 566                    "UTF-8", charset, NULL, NULL, error);
 567
 568   return str;
 569 #endif
 570 }
 571
 572 /*
 573  * g_locale_from_utf8
 574  *
 575  * The reverse of g_locale_to_utf8.
 576  */
 577
 578 gchar *
 579 g_locale_from_utf8 (const gchar *utf8string, GError **error)
 580 {
 581 #ifdef G_OS_WIN32
 582
 583   gint i, mask, clen, mblen;
 584   const gint len = strlen (utf8string);
 585   wchar_t *wcs, *wcp;
 586   gchar *result;
 587   guchar *cp, *end, c;
 588   gint n;
 589
 590   /* First convert to wide chars */
 591   cp = (guchar *) utf8string;
 592   end = cp + len;
 593   n = 0;
 594   wcs = g_new (wchar_t, len + 1);
 595   wcp = wcs;
 596   while (cp != end)
 597     {
 598       mask = 0;
 599       c = *cp;
 600
 601       if (c < 0x80)
 602         {
 603           clen = 1;
 604           mask = 0x7f;
 605         }
 606       else if ((c & 0xe0) == 0xc0)
 607         {
 608           clen = 2;
 609           mask = 0x1f;
 610         }
 611       else if ((c & 0xf0) == 0xe0)
 612         {
 613           clen = 3;
 614           mask = 0x0f;
 615         }
 616       else if ((c & 0xf8) == 0xf0)
 617         {
 618           clen = 4;
 619           mask = 0x07;
 620         }
 621       else if ((c & 0xfc) == 0xf8)
 622         {
 623           clen = 5;
 624           mask = 0x03;
 625         }
 626       else if ((c & 0xfc) == 0xfc)
 627         {
 628           clen = 6;
 629           mask = 0x01;
 630         }
 631       else
 632         {
 633           g_free (wcs);
 634           return NULL;
 635         }
 636
 637       if (cp + clen > end)
 638         {
 639           g_free (wcs);
 640           return NULL;
 641         }
 642
 643       *wcp = (cp[0] & mask);
 644       for (i = 1; i < clen; i++)
 645         {
 646           if ((cp[i] & 0xc0) != 0x80)
 647             {
 648               g_free (wcs);
 649               return NULL;
 650             }
 651           *wcp <<= 6;
 652           *wcp |= (cp[i] & 0x3f);
 653         }
 654
 655       cp += clen;
 656       wcp++;
 657       n++;
 658     }
 659   if (cp != end)
 660     {
 661       g_free (wcs);
 662       return NULL;
 663     }
 664
 665   /* n is the number of wide chars constructed */
 666
 667   /* Convert to a string in the current ANSI codepage */
 668
 669   result = g_new (gchar, 3 * n + 1);
 670   mblen = WideCharToMultiByte (CP_ACP, 0, wcs, n, result, 3*n, NULL, NULL);
 671   result[mblen] = 0;
 672   g_free (wcs);
 673
 674   return result;
 675
 676 #else
 677
 678   gchar *charset, *str;
 679
 680   if (g_get_charset (&charset))
 681     return g_strdup (utf8string);
 682
 683   str = g_convert (utf8string, strlen (utf8string),
 684                    charset, "UTF-8", NULL, NULL, error);
 685
 686   return str;
 687
 688 #endif
 689 }
 690
/* Filenames are in UTF-8 unless specifically requested otherwise */
 692
 693 gchar*
 694 g_filename_to_utf8 (const gchar *string, GError **error)
 695
 696 {
 697 #ifdef G_OS_WIN32
 698   return g_locale_to_utf8 (string, error);
 699 #else
 700   if (getenv ("G_BROKEN_FILENAMES"))
 701     return g_locale_to_utf8 (string, error);
 702
 703   return g_strdup (string);
 704 #endif
 705 }
 706
 707 gchar*
 708 g_filename_from_utf8 (const gchar *string, GError **error)
 709 {
 710 #ifdef G_OS_WIN32
 711   return g_locale_from_utf8 (string, error);
 712 #else
 713   if (getenv ("G_BROKEN_FILENAMES"))
 714     return g_locale_from_utf8 (string, error);
 715
 716   return g_strdup (string);
 717 #endif
 718 }
 719
 720