glib/gconvert.c

   1 /* GLIB - Library of useful routines for C programming
   2  *
   3  * gconvert.c: Convert between character sets using iconv
   4  * Copyright Red Hat Inc., 2000
   5  * Authors: Havoc Pennington <hp@redhat.com>, Owen Taylor <otaylor@redhat.com>
   6  *
   7  * This library is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2 of the License, or (at your option) any later version.
  11  *
  12  * This library is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with this library; if not, write to the
  19  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20  * Boston, MA 02111-1307, USA.
  21  */
  22
  23 #include <iconv.h>
  24 #include <errno.h>
  25 #include <string.h>
  26
  27 #include "glib.h"
  28
  29 GQuark
  30 g_convert_error_quark()
  31 {
  32   static GQuark quark;
  33   if (!quark)
  34     quark = g_quark_from_static_string ("g_convert_error");
  35   return quark;
  36 }
  37
  38 static iconv_t
  39 open_converter (const gchar *to_codeset,
  40                 const gchar *from_codeset,
  41                 GError     **error)
  42 {
  43   iconv_t cd = iconv_open (to_codeset, from_codeset);
  44
  45   if (cd == (iconv_t) -1)
  46     {
  47       /* Something went wrong.  */
  48       if (errno == EINVAL)
  49         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_CONVERSION,
  50                      "Conversion from character set `%s' to `%s' is not supported",
  51                      from_codeset, to_codeset);
  52       else
  53         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_OTHER,
  54                      "Could not open converter from `%s' to `%s': %s",
  55                      from_codeset, to_codeset, strerror (errno));
  56     }
  57
  58   return cd;
  59
  60 }
  61
  62 /**
  63  * g_convert:
  64  * @str:          the string to convert
  65  * @len:          the length of the string
  66  * @to_codeset:   name of character set into which to convert @str
  67  * @from_codeset: character set of @str.
  68  * @bytes_read:   location to store the number of bytes in the
  69  *                input string that were successfully converted, or %NULL.
  70  *                Even if the conversion was succesful, this may be
  71  *                less than len if there were partial characters
  72  *                at the end of the input. If the error
  73  *                G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
  74  *                stored will the byte fofset after the last valid
  75  *                input sequence.
  76  * @error:        location to store the error occuring, or %NULL to ignore
  77  *                errors. Any of the errors in #GConvertError may occur.
  78  *
  79  * Convert a string from one character set to another.
  80  *
  81  * Return value: If the conversion was successful, a newly allocated
  82  *               NUL-terminated string, which must be freed with
  83  *               g_free. Otherwise %NULL and @error will be set.
  84  **/
  85 gchar*
  86 g_convert (const gchar *str,
  87            gint         len,
  88            const gchar *to_codeset,
  89            const gchar *from_codeset,
  90            gint        *bytes_read,
  91            gint        *bytes_written,
  92            GError     **error)
  93 {
  94   gchar *dest;
  95   gchar *outp;
  96   const gchar *p;
  97   size_t inbytes_remaining;
  98   size_t outbytes_remaining;
  99   size_t err;
 100   iconv_t cd;
 101   size_t outbuf_size;
 102   gboolean have_error = FALSE;
 103
 104   g_return_val_if_fail (str != NULL, NULL);
 105   g_return_val_if_fail (to_codeset != NULL, NULL);
 106   g_return_val_if_fail (from_codeset != NULL, NULL);
 107
 108   cd = open_converter (to_codeset, from_codeset, error);
 109
 110   if (cd == (iconv_t) -1)
 111     {
 112       if (bytes_read)
 113         *bytes_read = 0;
 114
 115       if (bytes_written)
 116         *bytes_written = 0;
 117
 118       return NULL;
 119     }
 120
 121   if (len < 0)
 122     len = strlen (str);
 123
 124   p = str;
 125   inbytes_remaining = len;
 126   outbuf_size = len + 1; /* + 1 for nul in case len == 1 */
 127   outbytes_remaining = outbuf_size - 1; /* -1 for nul */
 128   outp = dest = g_malloc (outbuf_size);
 129
 130  again:
 131
 132   err = iconv (cd, &p, &inbytes_remaining, &outp, &outbytes_remaining);
 133
 134   if (err == (size_t) -1)
 135     {
 136       switch (errno)
 137         {
 138         case EINVAL:
 139           /* Incomplete text, do not report an error */
 140           break;
 141         case E2BIG:
 142           {
 143             size_t used = outp - dest;
 144             outbuf_size *= 2;
 145             dest = g_realloc (dest, outbuf_size);
 146
 147             outp = dest + used;
 148             outbytes_remaining = outbuf_size - used - 1; /* -1 for nul */
 149
 150             goto again;
 151           }
 152         case EILSEQ:
 153           g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 154                        "Invalid byte sequence in conversion input");
 155           have_error = TRUE;
 156           break;
 157         default:
 158           g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_OTHER,
 159                        "Error during conversion: %s",
 160                        strerror (errno));
 161           have_error = TRUE;
 162           break;
 163         }
 164     }
 165
 166   *outp = '\0';
 167
 168   iconv_close (cd);
 169
 170   if (bytes_read)
 171     *bytes_read = p - str;
 172
 173   if (bytes_written)
 174     *bytes_written = outp - dest;       /* Doesn't include '\0' */
 175
 176   if (have_error)
 177     {
 178       g_free (dest);
 179       return NULL;
 180     }
 181   else
 182     return dest;
 183 }
 184
 185 /**
 186  * g_convert_with_fallback:
 187  * @str:          the string to convert
 188  * @len:          the length of the string
 189  * @to_codeset:   name of character set into which to convert @str
 190  * @from_codeset: character set of @str.
 191  * @fallback:     UTF-8 string to use in place of character not
 192  *                present in the target encoding. (This must be
 193  *                in the target encoding), if %NULL, characters
 194  *                not in the target encoding will be represented
 195  *                as Unicode escapes \x{XXXX} or \x{XXXXXX}.
 196  * @bytes_read:   location to store the number of bytes in the
 197  *                input string that were successfully converted, or %NULL.
 198  *                Even if the conversion was succesful, this may be
 199  *                less than len if there were partial characters
 200  *                at the end of the input. If the error
 201  *                G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
 202  *                stored will the byte fofset after the last valid
 203  *                input sequence.
 204  * @error:        location to store the error occuring, or %NULL to ignore
 205  *                errors. Any of the errors in #GConvertError may occur.
 206  *
 207  * Convert a string from one character set to another, possibly
 208  * including fallback sequences for characters not representable
 209  * in the output. Note that it is not guaranteed that the specification
 210  * for the fallback sequences in @fallback will be honored. Some
 211  * systems may do a approximate conversion from @from_codeset
 212  * to @to_codeset in their iconv() functions, in which case GLib
 213  * will simply return that approximate conversion.
 214  *
 215  * Return value: If the conversion was successful, a newly allocated
 216  *               NUL-terminated string, which must be freed with
 217  *               g_free. Otherwise %NULL and @error will be set.
 218  **/
 219 gchar*
 220 g_convert_with_fallback (const gchar *str,
 221                          gint         len,
 222                          const gchar *to_codeset,
 223                          const gchar *from_codeset,
 224                          gchar       *fallback,
 225                          gint        *bytes_read,
 226                          gint        *bytes_written,
 227                          GError     **error)
 228 {
 229   gchar *utf8;
 230   gchar *dest;
 231   gchar *outp;
 232   const gchar *insert_str = NULL;
 233   const gchar *p;
 234   size_t inbytes_remaining;
 235   const gchar *save_p = NULL;
 236   size_t save_inbytes = 0;
 237   size_t outbytes_remaining;
 238   size_t err;
 239   iconv_t cd;
 240   size_t outbuf_size;
 241   gboolean have_error = FALSE;
 242   gboolean done = FALSE;
 243
 244   GError *local_error = NULL;
 245
 246   g_return_val_if_fail (str != NULL, NULL);
 247   g_return_val_if_fail (to_codeset != NULL, NULL);
 248   g_return_val_if_fail (from_codeset != NULL, NULL);
 249
 250   if (len < 0)
 251     len = strlen (str);
 252
 253   /* Try an exact conversion; we only proceed if this fails
 254    * due to an illegal sequence in the input string.
 255    */
 256   dest = g_convert (str, len, to_codeset, from_codeset,
 257                     bytes_read, bytes_written, &local_error);
 258   if (!local_error)
 259     return dest;
 260
 261   if (!g_error_matches (local_error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
 262     {
 263       g_propagate_error (error, local_error);
 264       return NULL;
 265     }
 266   else
 267     g_error_free (local_error);
 268
 269   /* No go; to proceed, we need a converter from "UTF-8" to
 270    * to_codeset, and the string as UTF-8.
 271    */
 272   cd = open_converter (to_codeset, "UTF-8", error);
 273   if (cd == (iconv_t) -1)
 274     {
 275       if (bytes_read)
 276         *bytes_read = 0;
 277
 278       if (bytes_written)
 279         *bytes_written = 0;
 280
 281       return NULL;
 282     }
 283
 284   utf8 = g_convert (str, len, "UTF-8", from_codeset,
 285                     bytes_read, &inbytes_remaining, error);
 286   if (!utf8)
 287     return NULL;
 288
 289   /* Now the heart of the code. We loop through the UTF-8 string, and
 290    * whenever we hit an offending character, we form fallback, convert
 291    * the fallback to the target codeset, and then go back to
 292    * converting the original string after finishing with the fallback.
 293    *
 294    * The variables save_p and save_inbytes store the input state
 295    * for the original string while we are converting the fallback
 296    */
 297   p = utf8;
 298   outbuf_size = len + 1; /* + 1 for nul in case len == 1 */
 299   outbytes_remaining = outbuf_size - 1; /* -1 for nul */
 300   outp = dest = g_malloc (outbuf_size);
 301
 302   while (!done && !have_error)
 303     {
 304       err = iconv (cd, &p, &inbytes_remaining, &outp, &outbytes_remaining);
 305
 306       if (err == (size_t) -1)
 307         {
 308           switch (errno)
 309             {
 310             case EINVAL:
 311               g_assert_not_reached();
 312               break;
 313             case E2BIG:
 314               {
 315                 size_t used = outp - dest;
 316                 outbuf_size *= 2;
 317                 dest = g_realloc (dest, outbuf_size);
 318
 319                 outp = dest + used;
 320                 outbytes_remaining = outbuf_size - used - 1; /* -1 for nul */
 321
 322                 break;
 323               }
 324             case EILSEQ:
 325               if (save_p)
 326                 {
 327                   /* Error converting fallback string - fatal
 328                    */
 329                   g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 330                                "Cannot convert fallback '%s' to codeset '%s'",
 331                                insert_str, to_codeset);
 332                   have_error = TRUE;
 333                   break;
 334                 }
 335               else
 336                 {
 337                   if (!fallback)
 338                     {
 339                       gunichar ch = g_utf8_get_char (p);
 340                       insert_str = g_strdup_printf ("\\x{%0*X}",
 341                                                     (ch < 0x10000) ? 4 : 6,
 342                                                     ch);
 343                     }
 344                   else
 345                     insert_str = fallback;
 346
 347                   save_p = g_utf8_next_char (p);
 348                   save_inbytes = inbytes_remaining - (save_p - p);
 349                   p = insert_str;
 350                   inbytes_remaining = strlen (p);
 351                 }
 352               break;
 353             default:
 354               g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_OTHER,
 355                            "Error during conversion: %s",
 356                            strerror (errno));
 357               have_error = TRUE;
 358               break;
 359             }
 360         }
 361       else
 362         {
 363           if (save_p)
 364             {
 365               if (!fallback)
 366                 g_free ((gchar *)insert_str);
 367               p = save_p;
 368               inbytes_remaining = save_inbytes;
 369               save_p = NULL;
 370             }
 371           else
 372             done = TRUE;
 373         }
 374     }
 375
 376   /* Cleanup
 377    */
 378   *outp = '\0';
 379
 380   iconv_close (cd);
 381
 382   if (bytes_written)
 383     *bytes_written = outp - str;        /* Doesn't include '\0' */
 384
 385   g_free (utf8);
 386
 387   if (have_error)
 388     {
 389       if (save_p && !fallback)
 390         g_free ((gchar *)insert_str);
 391       g_free (dest);
 392       return NULL;
 393     }
 394   else
 395     return dest;
 396 }