1 /* GLIB - Library of useful routines for C programming
3 * gconvert.c: Convert between character sets using iconv
4 * Copyright Red Hat Inc., 2000
5 * Authors: Havoc Pennington <hp@redhat.com>, Owen Taylor <otaylor@redhat.com>
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2 of the License, or (at your option) any later version.
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with this library; if not, write to the
19 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 * Boston, MA 02111-1307, USA.
31 #ifdef G_PLATFORM_WIN32
40 g_convert_error_quark()
44 quark = g_quark_from_static_string ("g_convert_error");
49 #if defined(USE_LIBICONV) && !defined (_LIBICONV_H)
50 #error libiconv in use but included iconv.h not from libiconv
52 #if !defined(USE_LIBICONV) && defined (_LIBICONV_H)
53 #error libiconv not in use but included iconv.h is from libiconv
58 * @to_codeset: destination codeset
59 * @from_codeset: source codeset
61 * Same as the standard UNIX routine iconv_open(), but
62 * may be implemented via libiconv on UNIX flavors that lack
63 * a native implementation.
65 * GLib provides g_convert() and g_locale_to_utf8() which are likely
66 * more convenient than the raw iconv wrappers.
68 * Return value: a "conversion descriptor"
71 g_iconv_open (const gchar *to_codeset,
72 const gchar *from_codeset)
74 iconv_t cd = iconv_open (to_codeset, from_codeset);
81 * @converter: conversion descriptor from g_iconv_open()
82 * @inbuf: bytes to convert
83 * @inbytes_left: inout parameter, bytes remaining to convert in @inbuf
84 * @outbuf: converted output bytes
85 * @outbytes_left: inout parameter, bytes available to fill in @outbuf
87 * Same as the standard UNIX routine iconv(), but
88 * may be implemented via libiconv on UNIX flavors that lack
89 * a native implementation.
91 * GLib provides g_convert() and g_locale_to_utf8() which are likely
92 * more convenient than the raw iconv wrappers.
94 * Return value: count of non-reversible conversions, or -1 on error
97 g_iconv (GIConv converter,
101 gsize *outbytes_left)
103 iconv_t cd = (iconv_t)converter;
105 return iconv (cd, inbuf, inbytes_left, outbuf, outbytes_left);
110 * @converter: a conversion descriptor from g_iconv_open()
112 * Same as the standard UNIX routine iconv_close(), but
113 * may be implemented via libiconv on UNIX flavors that lack
114 * a native implementation. Should be called to clean up
115 * the conversion descriptor from g_iconv_open() when
116 * you are done converting things.
118 * GLib provides g_convert() and g_locale_to_utf8() which are likely
119 * more convenient than the raw iconv wrappers.
121 * Return value: -1 on error, 0 on success
124 g_iconv_close (GIConv converter)
126 iconv_t cd = (iconv_t)converter;
128 return iconv_close (cd);
132 open_converter (const gchar *to_codeset,
133 const gchar *from_codeset,
136 GIConv cd = g_iconv_open (to_codeset, from_codeset);
138 if (cd == (iconv_t) -1)
140 /* Something went wrong. */
142 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_CONVERSION,
143 _("Conversion from character set `%s' to `%s' is not supported"),
144 from_codeset, to_codeset);
146 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
147 _("Could not open converter from `%s' to `%s': %s"),
148 from_codeset, to_codeset, strerror (errno));
157 * @str: the string to convert
158 * @len: the length of the string
159 * @to_codeset: name of character set into which to convert @str
160 * @from_codeset: character set of @str.
161 * @bytes_read: location to store the number of bytes in the
162 * input string that were successfully converted, or %NULL.
163 * Even if the conversion was successful, this may be
164 * less than len if there were partial characters
165 * at the end of the input. If the error
166 * G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
167 * stored will be the byte offset after the last valid
169 * @bytes_written: the number of bytes stored in the output buffer (not including the
171 * @error: location to store the error occurring, or %NULL to ignore
172 * errors. Any of the errors in #GConvertError may occur.
174 * Convert a string from one character set to another.
176 * Return value: If the conversion was successful, a newly allocated
177 * NUL-terminated string, which must be freed with
178 * g_free. Otherwise %NULL and @error will be set.
181 g_convert (const gchar *str,
183 const gchar *to_codeset,
184 const gchar *from_codeset,
186 gsize *bytes_written,
192 gsize inbytes_remaining;
193 gsize outbytes_remaining;
197 gboolean have_error = FALSE;
199 g_return_val_if_fail (str != NULL, NULL);
200 g_return_val_if_fail (to_codeset != NULL, NULL);
201 g_return_val_if_fail (from_codeset != NULL, NULL);
203 cd = open_converter (to_codeset, from_codeset, error);
205 if (cd == (GIConv) -1)
220 inbytes_remaining = len;
221 outbuf_size = len + 1; /* + 1 for nul in case len == 1 */
223 outbytes_remaining = outbuf_size - 1; /* -1 for nul */
224 outp = dest = g_malloc (outbuf_size);
228 err = g_iconv (cd, (char **)&p, &inbytes_remaining, &outp, &outbytes_remaining);
230 if (err == (size_t) -1)
235 /* Incomplete text, do not report an error */
239 size_t used = outp - dest;
242 dest = g_realloc (dest, outbuf_size);
245 outbytes_remaining = outbuf_size - used - 1; /* -1 for nul */
250 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
251 _("Invalid byte sequence in conversion input"));
255 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
256 _("Error during conversion: %s"),
268 *bytes_read = p - str;
271 if ((p - str) != len)
275 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
276 _("Partial character sequence at end of input"));
283 *bytes_written = outp - dest; /* Doesn't include '\0' */
295 * g_convert_with_fallback:
296 * @str: the string to convert
297 * @len: the length of the string
298 * @to_codeset: name of character set into which to convert @str
299 * @from_codeset: character set of @str.
300 * @fallback: UTF-8 string to use in place of character not
301 * present in the target encoding. (This must be
302 * representable in the target encoding.) If %NULL, characters
303 * not in the target encoding will be represented
304 * as Unicode escapes \x{XXXX} or \x{XXXXXX}.
305 * @bytes_read: location to store the number of bytes in the
306 * input string that were successfully converted, or %NULL.
307 * Even if the conversion was successful, this may be
308 * less than len if there were partial characters
309 * at the end of the input.
310 * @bytes_written: the number of bytes stored in the output buffer (not including the
312 * @error: location to store the error occurring, or %NULL to ignore
313 * errors. Any of the errors in #GConvertError may occur.
315 * Convert a string from one character set to another, possibly
316 * including fallback sequences for characters not representable
317 * in the output. Note that it is not guaranteed that the specification
318 * for the fallback sequences in @fallback will be honored. Some
319 * systems may do an approximate conversion from @from_codeset
320 * to @to_codeset in their iconv() functions, in which case GLib
321 * will simply return that approximate conversion.
323 * Return value: If the conversion was successful, a newly allocated
324 * NUL-terminated string, which must be freed with
325 * g_free. Otherwise %NULL and @error will be set.
328 g_convert_with_fallback (const gchar *str,
330 const gchar *to_codeset,
331 const gchar *from_codeset,
334 gsize *bytes_written,
340 const gchar *insert_str = NULL;
342 gsize inbytes_remaining;
343 const gchar *save_p = NULL;
344 gsize save_inbytes = 0;
345 gsize outbytes_remaining;
349 gboolean have_error = FALSE;
350 gboolean done = FALSE;
352 GError *local_error = NULL;
354 g_return_val_if_fail (str != NULL, NULL);
355 g_return_val_if_fail (to_codeset != NULL, NULL);
356 g_return_val_if_fail (from_codeset != NULL, NULL);
361 /* Try an exact conversion; we only proceed if this fails
362 * due to an illegal sequence in the input string.
364 dest = g_convert (str, len, to_codeset, from_codeset,
365 bytes_read, bytes_written, &local_error);
369 if (!g_error_matches (local_error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
371 g_propagate_error (error, local_error);
375 g_error_free (local_error);
379 /* No go; to proceed, we need a converter from "UTF-8" to
380 * to_codeset, and the string as UTF-8.
382 cd = open_converter (to_codeset, "UTF-8", error);
383 if (cd == (GIConv) -1)
394 utf8 = g_convert (str, len, "UTF-8", from_codeset,
395 bytes_read, &inbytes_remaining, error);
399 /* Now the heart of the code. We loop through the UTF-8 string, and
400 * whenever we hit an offending character, we form fallback, convert
401 * the fallback to the target codeset, and then go back to
402 * converting the original string after finishing with the fallback.
404 * The variables save_p and save_inbytes store the input state
405 * for the original string while we are converting the fallback
409 outbuf_size = len + 1; /* + 1 for nul in case len == 1 */
410 outbytes_remaining = outbuf_size - 1; /* -1 for nul */
411 outp = dest = g_malloc (outbuf_size);
413 while (!done && !have_error)
415 size_t inbytes_tmp = inbytes_remaining;
416 err = g_iconv (cd, (char **)&p, &inbytes_tmp, &outp, &outbytes_remaining);
417 inbytes_remaining = inbytes_tmp;
419 if (err == (size_t) -1)
424 g_assert_not_reached();
428 size_t used = outp - dest;
431 dest = g_realloc (dest, outbuf_size);
434 outbytes_remaining = outbuf_size - used - 1; /* -1 for nul */
441 /* Error converting fallback string - fatal
443 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
444 _("Cannot convert fallback '%s' to codeset '%s'"),
445 insert_str, to_codeset);
453 gunichar ch = g_utf8_get_char (p);
454 insert_str = g_strdup_printf ("\\x{%0*X}",
455 (ch < 0x10000) ? 4 : 6,
459 insert_str = fallback;
461 save_p = g_utf8_next_char (p);
462 save_inbytes = inbytes_remaining - (save_p - p);
464 inbytes_remaining = strlen (p);
468 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
469 _("Error during conversion: %s"),
480 g_free ((gchar *)insert_str);
482 inbytes_remaining = save_inbytes;
497 *bytes_written = outp - dest; /* Bytes stored in dest (not the input str); doesn't include '\0' */
503 if (save_p && !fallback)
504 g_free ((gchar *)insert_str);
519 strdup_len (const gchar *string,
521 gsize *bytes_written,
528 real_len = strlen (string);
533 while (real_len < len && string[real_len])
538 *bytes_read = real_len;
540 *bytes_written = real_len;
542 return g_strndup (string, real_len);
547 * @opsysstring: a string in the encoding of the current locale
548 * @len: the length of the string, or -1 if the string is
550 * @bytes_read: location to store the number of bytes in the
551 * input string that were successfully converted, or %NULL.
552 * Even if the conversion was successful, this may be
553 * less than len if there were partial characters
554 * at the end of the input. If the error
555 * G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
556 * stored will be the byte offset after the last valid
558 * @bytes_written: the number of bytes stored in the output buffer (not including the
560 * @error: location to store the error occurring, or %NULL to ignore
561 * errors. Any of the errors in #GConvertError may occur.
563 * Converts a string which is in the encoding used for strings by
564 * the C runtime (usually the same as that used by the operating
565 * system) in the current locale into a UTF-8 string.
567 * Return value: The converted string, or %NULL on an error.
570 g_locale_to_utf8 (const gchar *opsysstring,
573 gsize *bytes_written,
576 #ifdef G_PLATFORM_WIN32
578 gint i, clen, total_len, wclen, first;
584 len = strlen (opsysstring);
586 wcs = g_new (wchar_t, len);
587 wclen = MultiByteToWideChar (CP_ACP, 0, opsysstring, len, wcs, len);
591 for (i = 0; i < wclen; i++)
599 else if (wc < 0x10000)
601 else if (wc < 0x200000)
603 else if (wc < 0x4000000)
609 result = g_malloc (total_len + 1);
613 for (i = 0; i < wclen; i++)
627 else if (wc < 0x10000)
632 else if (wc < 0x200000)
637 else if (wc < 0x4000000)
651 case 6: bp[5] = (wc & 0x3f) | 0x80; wc >>= 6; /* Fall through */
652 case 5: bp[4] = (wc & 0x3f) | 0x80; wc >>= 6; /* Fall through */
653 case 4: bp[3] = (wc & 0x3f) | 0x80; wc >>= 6; /* Fall through */
654 case 3: bp[2] = (wc & 0x3f) | 0x80; wc >>= 6; /* Fall through */
655 case 2: bp[1] = (wc & 0x3f) | 0x80; wc >>= 6; /* Fall through */
656 case 1: bp[0] = wc | first;
668 *bytes_written = total_len;
672 #else /* !G_PLATFORM_WIN32 */
676 if (g_get_charset (&charset))
677 return strdup_len (opsysstring, len, bytes_read, bytes_written);
679 return g_convert (opsysstring, len,
680 "UTF-8", charset, bytes_read, bytes_written, error);
682 #endif /* !G_PLATFORM_WIN32 */
686 * g_locale_from_utf8:
687 * @utf8string: a UTF-8 encoded string
688 * @len: the length of the string, or -1 if the string is
690 * @bytes_read: location to store the number of bytes in the
691 * input string that were successfully converted, or %NULL.
692 * Even if the conversion was successful, this may be
693 * less than len if there were partial characters
694 * at the end of the input. If the error
695 * G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
696 * stored will be the byte offset after the last valid
698 * @bytes_written: the number of bytes stored in the output buffer (not including the
700 * @error: location to store the error occurring, or %NULL to ignore
701 * errors. Any of the errors in #GConvertError may occur.
703 * Converts a string from UTF-8 to the encoding used for strings by
704 * the C runtime (usually the same as that used by the operating
705 * system) in the current locale.
707 * Return value: The converted string, or %NULL on an error.
710 g_locale_from_utf8 (const gchar *utf8string,
713 gsize *bytes_written,
716 #ifdef G_PLATFORM_WIN32
718 gint i, mask, clen, mblen;
725 len = strlen (utf8string);
727 /* First convert to wide chars */
728 cp = (guchar *) utf8string;
731 wcs = g_new (wchar_t, len + 1);
743 else if ((c & 0xe0) == 0xc0)
748 else if ((c & 0xf0) == 0xe0)
753 else if ((c & 0xf8) == 0xf0)
758 else if ((c & 0xfc) == 0xf8)
763 else if ((c & 0xfe) == 0xfc) /* 6-byte lead is 1111110x; 0xfe and 0xff are never valid lead bytes */
780 *wcp = (cp[0] & mask);
781 for (i = 1; i < clen; i++)
783 if ((cp[i] & 0xc0) != 0x80)
789 *wcp |= (cp[i] & 0x3f);
802 /* n is the number of wide chars constructed */
804 /* Convert to a string in the current ANSI codepage */
806 result = g_new (gchar, 3 * n + 1);
807 mblen = WideCharToMultiByte (CP_ACP, 0, wcs, n, result, 3*n, NULL, NULL);
814 *bytes_written = mblen;
818 #else /* !G_PLATFORM_WIN32 */
820 const gchar *charset;
822 if (g_get_charset (&charset))
823 return strdup_len (utf8string, len, bytes_read, bytes_written);
825 return g_convert (utf8string, len,
826 charset, "UTF-8", bytes_read, bytes_written, error);
828 #endif /* !G_PLATFORM_WIN32 */
832 * g_filename_to_utf8:
833 * @opsysstring: a string in the encoding for filenames
834 * @len: the length of the string, or -1 if the string is
836 * @bytes_read: location to store the number of bytes in the
837 * input string that were successfully converted, or %NULL.
838 * Even if the conversion was successful, this may be
839 * less than len if there were partial characters
840 * at the end of the input. If the error
841 * G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
842 * stored will be the byte offset after the last valid
844 * @bytes_written: the number of bytes stored in the output buffer (not including the
846 * @error: location to store the error occurring, or %NULL to ignore
847 * errors. Any of the errors in #GConvertError may occur.
849 * Converts a string which is in the encoding used for filenames
850 * into a UTF-8 string.
852 * Return value: The converted string, or %NULL on an error.
855 g_filename_to_utf8 (const gchar *opsysstring,
858 gsize *bytes_written,
861 #ifdef G_PLATFORM_WIN32
862 return g_locale_to_utf8 (opsysstring, len,
863 bytes_read, bytes_written,
865 #else /* !G_PLATFORM_WIN32 */
866 if (getenv ("G_BROKEN_FILENAMES"))
867 return g_locale_to_utf8 (opsysstring, len,
868 bytes_read, bytes_written,
871 return strdup_len (opsysstring, len, bytes_read, bytes_written);
872 #endif /* !G_PLATFORM_WIN32 */
876 * g_filename_from_utf8:
877 * @utf8string: a UTF-8 encoded string
878 * @len: the length of the string, or -1 if the string is
880 * @bytes_read: location to store the number of bytes in the
881 * input string that were successfully converted, or %NULL.
882 * Even if the conversion was successful, this may be
883 * less than len if there were partial characters
884 * at the end of the input. If the error
885 * G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
886 * stored will be the byte offset after the last valid
888 * @bytes_written: the number of bytes stored in the output buffer (not including the
890 * @error: location to store the error occurring, or %NULL to ignore
891 * errors. Any of the errors in #GConvertError may occur.
893 * Converts a string from UTF-8 to the encoding used for filenames.
895 * Return value: The converted string, or %NULL on an error.
898 g_filename_from_utf8 (const gchar *utf8string,
901 gsize *bytes_written,
904 #ifdef G_PLATFORM_WIN32
905 return g_locale_from_utf8 (utf8string, len,
906 bytes_read, bytes_written,
908 #else /* !G_PLATFORM_WIN32 */
909 if (getenv ("G_BROKEN_FILENAMES"))
910 return g_locale_from_utf8 (utf8string, len,
911 bytes_read, bytes_written,
914 return strdup_len (utf8string, len, bytes_read, bytes_written);
915 #endif /* !G_PLATFORM_WIN32 */