From a212f93df26c24dc724905555b9525fe455be863 Mon Sep 17 00:00:00 2001 From: Owen Taylor Date: Mon, 11 Sep 2000 03:03:18 +0000 Subject: [PATCH] Havoc Pennington's implementation of convenient character set conversion Sun Sep 10 12:37:40 2000 Owen Taylor * glib.h gconvert.c (g_convert): Havoc Pennington's implementation of convenient character set conversion using iconv, with the addition of GError. We probably need a fallback that just does conversions between, say UTF-8,16,32 and ISO-8859-1 for targets without iconv at all. Also add g_convert_with_fallback() to take care of conversions where we accept some loss going to the target encoding. --- ChangeLog | 11 ++ ChangeLog.pre-2-0 | 11 ++ ChangeLog.pre-2-10 | 11 ++ ChangeLog.pre-2-12 | 11 ++ ChangeLog.pre-2-2 | 11 ++ ChangeLog.pre-2-4 | 11 ++ ChangeLog.pre-2-6 | 11 ++ ChangeLog.pre-2-8 | 11 ++ Makefile.am | 1 + gconvert.c | 396 +++++++++++++++++++++++++++++++++++++++++++++++++++++ glib.h | 26 ++++ glib/Makefile.am | 1 + glib/gconvert.c | 396 +++++++++++++++++++++++++++++++++++++++++++++++++++++ glib/glib.h | 26 ++++ 14 files changed, 934 insertions(+) create mode 100644 gconvert.c create mode 100644 glib/gconvert.c diff --git a/ChangeLog b/ChangeLog index 1abdfe7..d2dc8c5 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,14 @@ +Sun Sep 10 12:37:40 2000 Owen Taylor + + * glib.h gconvert.c (g_convert): Havoc Pennington's implementation + of convenient character set conversion using iconv, with + the addition of GError. We probably need a fallback that + just does conversions between, say UTF-8,16,32 and ISO-8859-1 + for targets without iconv at all. + + Also add g_convert_with_fallback() to take care of conversions + where we accept some loss going to the target encoding. + 2000-09-10 Havoc Pennington * gutf8.c (g_utf8_validate): Add this function. 
diff --git a/ChangeLog.pre-2-0 b/ChangeLog.pre-2-0 index 1abdfe7..d2dc8c5 100644 --- a/ChangeLog.pre-2-0 +++ b/ChangeLog.pre-2-0 @@ -1,3 +1,14 @@ +Sun Sep 10 12:37:40 2000 Owen Taylor + + * glib.h gconvert.c (g_convert): Havoc Pennington's implementation + of convenient character set conversion using iconv, with + the addition of GError. We probably need a fallback that + just does conversions between, say UTF-8,16,32 and ISO-8859-1 + for targets without iconv at all. + + Also add g_convert_with_fallback() to take care of conversions + where we accept some loss going to the target encoding. + 2000-09-10 Havoc Pennington * gutf8.c (g_utf8_validate): Add this function. diff --git a/ChangeLog.pre-2-10 b/ChangeLog.pre-2-10 index 1abdfe7..d2dc8c5 100644 --- a/ChangeLog.pre-2-10 +++ b/ChangeLog.pre-2-10 @@ -1,3 +1,14 @@ +Sun Sep 10 12:37:40 2000 Owen Taylor + + * glib.h gconvert.c (g_convert): Havoc Pennington's implementation + of convenient character set conversion using iconv, with + the addition of GError. We probably need a fallback that + just does conversions between, say UTF-8,16,32 and ISO-8859-1 + for targets without iconv at all. + + Also add g_convert_with_fallback() to take care of conversions + where we accept some loss going to the target encoding. + 2000-09-10 Havoc Pennington * gutf8.c (g_utf8_validate): Add this function. diff --git a/ChangeLog.pre-2-12 b/ChangeLog.pre-2-12 index 1abdfe7..d2dc8c5 100644 --- a/ChangeLog.pre-2-12 +++ b/ChangeLog.pre-2-12 @@ -1,3 +1,14 @@ +Sun Sep 10 12:37:40 2000 Owen Taylor + + * glib.h gconvert.c (g_convert): Havoc Pennington's implementation + of convenient character set conversion using iconv, with + the addition of GError. We probably need a fallback that + just does conversions between, say UTF-8,16,32 and ISO-8859-1 + for targets without iconv at all. + + Also add g_convert_with_fallback() to take care of conversions + where we accept some loss going to the target encoding. 
+ 2000-09-10 Havoc Pennington * gutf8.c (g_utf8_validate): Add this function. diff --git a/ChangeLog.pre-2-2 b/ChangeLog.pre-2-2 index 1abdfe7..d2dc8c5 100644 --- a/ChangeLog.pre-2-2 +++ b/ChangeLog.pre-2-2 @@ -1,3 +1,14 @@ +Sun Sep 10 12:37:40 2000 Owen Taylor + + * glib.h gconvert.c (g_convert): Havoc Pennington's implementation + of convenient character set conversion using iconv, with + the addition of GError. We probably need a fallback that + just does conversions between, say UTF-8,16,32 and ISO-8859-1 + for targets without iconv at all. + + Also add g_convert_with_fallback() to take care of conversions + where we accept some loss going to the target encoding. + 2000-09-10 Havoc Pennington * gutf8.c (g_utf8_validate): Add this function. diff --git a/ChangeLog.pre-2-4 b/ChangeLog.pre-2-4 index 1abdfe7..d2dc8c5 100644 --- a/ChangeLog.pre-2-4 +++ b/ChangeLog.pre-2-4 @@ -1,3 +1,14 @@ +Sun Sep 10 12:37:40 2000 Owen Taylor + + * glib.h gconvert.c (g_convert): Havoc Pennington's implementation + of convenient character set conversion using iconv, with + the addition of GError. We probably need a fallback that + just does conversions between, say UTF-8,16,32 and ISO-8859-1 + for targets without iconv at all. + + Also add g_convert_with_fallback() to take care of conversions + where we accept some loss going to the target encoding. + 2000-09-10 Havoc Pennington * gutf8.c (g_utf8_validate): Add this function. diff --git a/ChangeLog.pre-2-6 b/ChangeLog.pre-2-6 index 1abdfe7..d2dc8c5 100644 --- a/ChangeLog.pre-2-6 +++ b/ChangeLog.pre-2-6 @@ -1,3 +1,14 @@ +Sun Sep 10 12:37:40 2000 Owen Taylor + + * glib.h gconvert.c (g_convert): Havoc Pennington's implementation + of convenient character set conversion using iconv, with + the addition of GError. We probably need a fallback that + just does conversions between, say UTF-8,16,32 and ISO-8859-1 + for targets without iconv at all. 
+ + Also add g_convert_with_fallback() to take care of conversions + where we accept some loss going to the target encoding. + 2000-09-10 Havoc Pennington * gutf8.c (g_utf8_validate): Add this function. diff --git a/ChangeLog.pre-2-8 b/ChangeLog.pre-2-8 index 1abdfe7..d2dc8c5 100644 --- a/ChangeLog.pre-2-8 +++ b/ChangeLog.pre-2-8 @@ -1,3 +1,14 @@ +Sun Sep 10 12:37:40 2000 Owen Taylor + + * glib.h gconvert.c (g_convert): Havoc Pennington's implementation + of convenient character set conversion using iconv, with + the addition of GError. We probably need a fallback that + just does conversions between, say UTF-8,16,32 and ISO-8859-1 + for targets without iconv at all. + + Also add g_convert_with_fallback() to take care of conversions + where we accept some loss going to the target encoding. + 2000-09-10 Havoc Pennington * gutf8.c (g_utf8_validate): Add this function. diff --git a/Makefile.am b/Makefile.am index 9f73023..ff411f4 100644 --- a/Makefile.am +++ b/Makefile.am @@ -45,6 +45,7 @@ libglib_1_3_la_SOURCES = \ gbacktrace.c \ gcache.c \ gcompletion.c \ + gconvert.c \ gdataset.c \ gdate.c \ gerror.c \ diff --git a/gconvert.c b/gconvert.c new file mode 100644 index 0000000..b1dfeab --- /dev/null +++ b/gconvert.c @@ -0,0 +1,396 @@ +/* GLIB - Library of useful routines for C programming + * + * gconvert.c: Convert between character sets using iconv + * Copyright Red Hat Inc., 2000 + * Authors: Havoc Pennington , Owen Taylor +#include +#include + +#include "glib.h" + +GQuark +g_convert_error_quark() +{ + static GQuark quark; /* zero until the quark has been looked up once */ + if (!quark) + quark = g_quark_from_static_string ("g_convert_error"); + return quark; +} + +static iconv_t +open_converter (const gchar *to_codeset, + const gchar *from_codeset, + GError **error) +{ + iconv_t cd = iconv_open (to_codeset, from_codeset); + + if (cd == (iconv_t) -1) + { + /* Something went wrong: EINVAL is reported as an unsupported conversion, any other errno as a generic failure. 
*/ + if (errno == EINVAL) + g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_CONVERSION, + "Conversion from character set `%s' to `%s' is not supported", + from_codeset, to_codeset); + else + g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_OTHER, + "Could not open converter from `%s' to `%s': %s", + from_codeset, to_codeset, strerror (errno)); + } + + return cd; /* still (iconv_t) -1 on failure, after @error has been set above */ + +} + +/** + * g_convert: + * @str: the string to convert + * @len: the length of the string in bytes, or a negative value if @str is nul-terminated + * @to_codeset: name of character set into which to convert @str + * @from_codeset: character set of @str. + * @bytes_read: location to store the number of bytes in the input + * string that were successfully converted, or %NULL. Even if the + * conversion was successful, this may be less than @len if there + * were partial characters at the end of the input. If the error + * G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value stored will + * be the byte offset after the last valid input sequence. + * @bytes_written: location to store the number of bytes written to the + * output buffer, not counting the terminating nul, or %NULL. + * @error: location to store the error occurring, or %NULL to ignore + * errors. Any of the errors in #GConvertError may occur. + * + * Convert a string from one character set to another. + * + * Return value: If the conversion was successful, a newly allocated + * NUL-terminated string, which must be freed with + * g_free. Otherwise %NULL and @error will be set. 
+ **/ +gchar* +g_convert (const gchar *str, + gint len, + const gchar *to_codeset, + const gchar *from_codeset, + gint *bytes_read, + gint *bytes_written, + GError **error) +{ + gchar *dest; + gchar *outp; + const gchar *p; + size_t inbytes_remaining; + size_t outbytes_remaining; + size_t err; + iconv_t cd; + size_t outbuf_size; + gboolean have_error = FALSE; + + g_return_val_if_fail (str != NULL, NULL); + g_return_val_if_fail (to_codeset != NULL, NULL); + g_return_val_if_fail (from_codeset != NULL, NULL); + + cd = open_converter (to_codeset, from_codeset, error); + + if (cd == (iconv_t) -1) + { + if (bytes_read) + *bytes_read = 0; + + if (bytes_written) + *bytes_written = 0; + + return NULL; + } + + if (len < 0) + len = strlen (str); + + p = str; + inbytes_remaining = len; + outbuf_size = len + 1; /* + 1 so there is always room for the terminating nul */ + outbytes_remaining = outbuf_size - 1; /* -1 for nul */ + outp = dest = g_malloc (outbuf_size); + + again: + + err = iconv (cd, &p, &inbytes_remaining, &outp, &outbytes_remaining); + + if (err == (size_t) -1) + { + switch (errno) + { + case EINVAL: + /* Incomplete multibyte sequence at the end of the input: not an error, *bytes_read tells the caller where we stopped */ + break; + case E2BIG: + { + size_t used = outp - dest; + outbuf_size *= 2; + dest = g_realloc (dest, outbuf_size); + + outp = dest + used; + outbytes_remaining = outbuf_size - used - 1; /* -1 for nul */ + + goto again; /* retry with the doubled buffer */ + } + case EILSEQ: + g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, + "Invalid byte sequence in conversion input"); + have_error = TRUE; + break; + default: + g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_OTHER, + "Error during conversion: %s", + strerror (errno)); + have_error = TRUE; + break; + } + } + + *outp = '\0'; + + iconv_close (cd); + + if (bytes_read) + *bytes_read = p - str; + + if (bytes_written) + *bytes_written = outp - dest; /* Doesn't include '\0' */ + + if (have_error) + { + g_free (dest); + return NULL; + } + else + return dest; +} + +/** + * g_convert_with_fallback: + * @str: the 
string to convert + * @len: the length of the string in bytes, or a negative value if @str is nul-terminated + * @to_codeset: name of character set into which to convert @str + * @from_codeset: character set of @str. + * @fallback: UTF-8 string to use in place of characters not + * present in the target encoding. (The string itself must + * be representable in the target encoding.) If %NULL, + * characters not in the target encoding will be represented + * as Unicode escapes \x{XXXX} or \x{XXXXXX}. + * @bytes_read: location to store the number of bytes in the input + * string that were successfully converted, or %NULL. Even if the + * conversion was successful, this may be less than @len if there + * were partial characters at the end of the input. If the error + * G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value stored will + * be the byte offset after the last valid input sequence. + * @bytes_written: location to store the number of bytes written to the + * output buffer, not counting the terminating nul, or %NULL. + * @error: location to store the error occurring, or %NULL to ignore + * errors. Any of the errors in #GConvertError may occur. + * + * Convert a string from one character set to another, possibly + * including fallback sequences for characters not representable + * in the output. Note that it is not guaranteed that the specification + * for the fallback sequences in @fallback will be honored. Some + * systems may do an approximate conversion from @from_codeset + * to @to_codeset in their iconv() functions, in which case GLib + * will simply return that approximate conversion. + * + * Return value: If the conversion was successful, a newly allocated + * NUL-terminated string, which must be freed with + * g_free. Otherwise %NULL and @error will be set. 
+ **/ +gchar* +g_convert_with_fallback (const gchar *str, + gint len, + const gchar *to_codeset, + const gchar *from_codeset, + gchar *fallback, + gint *bytes_read, + gint *bytes_written, + GError **error) +{ + gchar *utf8; + gchar *dest; + gchar *outp; + const gchar *insert_str = NULL; + const gchar *p; + size_t inbytes_remaining; + const gchar *save_p = NULL; + size_t save_inbytes = 0; + size_t outbytes_remaining; + size_t err; + iconv_t cd; + size_t outbuf_size; + gboolean have_error = FALSE; + gboolean done = FALSE; + gint utf8_len = 0; /* g_convert() reports byte counts as gint, not size_t */ + GError *local_error = NULL; + + g_return_val_if_fail (str != NULL, NULL); + g_return_val_if_fail (to_codeset != NULL, NULL); + g_return_val_if_fail (from_codeset != NULL, NULL); + + if (len < 0) + len = strlen (str); + + /* Try an exact conversion; we only proceed if this fails + * due to an illegal sequence in the input string. + */ + dest = g_convert (str, len, to_codeset, from_codeset, + bytes_read, bytes_written, &local_error); + if (!local_error) + return dest; + + if (!g_error_matches (local_error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE)) + { + g_propagate_error (error, local_error); + return NULL; + } + else + g_error_free (local_error); + + /* No go; to proceed, we need a converter from "UTF-8" to + * to_codeset, and the string as UTF-8. + */ + cd = open_converter (to_codeset, "UTF-8", error); + if (cd == (iconv_t) -1) + { + if (bytes_read) + *bytes_read = 0; + if (bytes_written) + *bytes_written = 0; + return NULL; + } + + utf8 = g_convert (str, len, "UTF-8", from_codeset, + bytes_read, &utf8_len, error); + if (!utf8) + { + iconv_close (cd); /* don't leak the converter opened above */ + return NULL; + } + inbytes_remaining = utf8_len; + + /* Now the heart of the code. We loop through the UTF-8 string, and + * whenever we hit an offending character, we form fallback, convert + * the fallback to the target codeset, and then go back to + * converting the original string after finishing with the fallback. 
+ * + * The variables save_p and save_inbytes store the input state + * for the original string while we are converting the fallback + */ + p = utf8; + outbuf_size = len + 1; /* + 1 for nul in case len == 1 */ + outbytes_remaining = outbuf_size - 1; /* -1 for nul */ + outp = dest = g_malloc (outbuf_size); + + while (!done && !have_error) + { + err = iconv (cd, &p, &inbytes_remaining, &outp, &outbytes_remaining); + + if (err == (size_t) -1) + { + switch (errno) + { + case EINVAL: + g_assert_not_reached(); + break; + case E2BIG: + { + size_t used = outp - dest; + outbuf_size *= 2; + dest = g_realloc (dest, outbuf_size); + + outp = dest + used; + outbytes_remaining = outbuf_size - used - 1; /* -1 for nul */ + + break; + } + case EILSEQ: + if (save_p) + { + /* Error converting fallback string - fatal + */ + g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, + "Cannot convert fallback '%s' to codeset '%s'", + insert_str, to_codeset); + have_error = TRUE; + break; + } + else + { + if (!fallback) + { + gunichar ch = g_utf8_get_char (p); + insert_str = g_strdup_printf ("\\x{%0*X}", + (ch < 0x10000) ? 
4 : 6, + ch); + } + else + insert_str = fallback; + + save_p = g_utf8_next_char (p); + save_inbytes = inbytes_remaining - (save_p - p); + p = insert_str; + inbytes_remaining = strlen (p); + } + break; + default: + g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_OTHER, + "Error during conversion: %s", + strerror (errno)); + have_error = TRUE; + break; + } + } + else + { + if (save_p) + { + if (!fallback) + g_free ((gchar *)insert_str); + p = save_p; + inbytes_remaining = save_inbytes; + save_p = NULL; + } + else + done = TRUE; + } + } + + /* Cleanup + */ + *outp = '\0'; + iconv_close (cd); + if (bytes_written) + *bytes_written = outp - dest; /* Offset within the output buffer; doesn't include '\0' */ + + g_free (utf8); + + if (have_error) + { + if (save_p && !fallback) + g_free ((gchar *)insert_str); + g_free (dest); + return NULL; + } + else + return dest; +} diff --git a/glib.h b/glib.h index cd5c3d9..0c4d9f5 100644 --- a/glib.h +++ b/glib.h @@ -3403,6 +3403,32 @@ guint g_thread_pool_get_num_unused_threads (void); /* Stop all currently unused threads, but leave the limit untouched */ void g_thread_pool_stop_unused_threads (void); +typedef enum +{ + G_CONVERT_ERROR_NO_CONVERSION, + G_CONVERT_ERROR_ILLEGAL_SEQUENCE, + G_CONVERT_ERROR_OTHER +} GConvertError; + +#define G_CONVERT_ERROR g_convert_error_quark() +GQuark g_convert_error_quark(); + +gchar* g_convert (const gchar *str, + gint len, + const gchar *to_codeset, + const gchar *from_codeset, + gint *bytes_read, + gint *bytes_written, + GError **error); +gchar* g_convert_with_fallback (const gchar *str, + gint len, + const gchar *to_codeset, + const gchar *from_codeset, + gchar *fallback, + gint *bytes_read, + gint *bytes_written, + GError **error); + #ifdef __cplusplus } #endif /* __cplusplus */ diff --git a/glib/Makefile.am b/glib/Makefile.am index 9f73023..ff411f4 100644 --- a/glib/Makefile.am +++ b/glib/Makefile.am @@ -45,6 +45,7 @@ libglib_1_3_la_SOURCES = \ gbacktrace.c \ gcache.c \ gcompletion.c \ + gconvert.c \ gdataset.c \ 
gdate.c \ gerror.c \ diff --git a/glib/gconvert.c b/glib/gconvert.c new file mode 100644 index 0000000..b1dfeab --- /dev/null +++ b/glib/gconvert.c @@ -0,0 +1,396 @@ +/* GLIB - Library of useful routines for C programming + * + * gconvert.c: Convert between character sets using iconv + * Copyright Red Hat Inc., 2000 + * Authors: Havoc Pennington , Owen Taylor +#include +#include + +#include "glib.h" + +GQuark +g_convert_error_quark() +{ + static GQuark quark; /* zero until the quark has been looked up once */ + if (!quark) + quark = g_quark_from_static_string ("g_convert_error"); + return quark; +} + +static iconv_t +open_converter (const gchar *to_codeset, + const gchar *from_codeset, + GError **error) +{ + iconv_t cd = iconv_open (to_codeset, from_codeset); + + if (cd == (iconv_t) -1) + { + /* Something went wrong: EINVAL is reported as an unsupported conversion, any other errno as a generic failure. */ + if (errno == EINVAL) + g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_CONVERSION, + "Conversion from character set `%s' to `%s' is not supported", + from_codeset, to_codeset); + else + g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_OTHER, + "Could not open converter from `%s' to `%s': %s", + from_codeset, to_codeset, strerror (errno)); + } + + return cd; /* still (iconv_t) -1 on failure, after @error has been set above */ + +} + +/** + * g_convert: + * @str: the string to convert + * @len: the length of the string in bytes, or a negative value if @str is nul-terminated + * @to_codeset: name of character set into which to convert @str + * @from_codeset: character set of @str. + * @bytes_read: location to store the number of bytes in the input + * string that were successfully converted, or %NULL. Even if the + * conversion was successful, this may be less than @len if there + * were partial characters at the end of the input. If the error + * G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value stored will + * be the byte offset after the last valid input sequence. + * @bytes_written: location to store the number of bytes written to the + * output buffer, not counting the terminating nul, or %NULL. + * @error: location to store the error occurring, or %NULL to ignore + * errors. Any of the errors in #GConvertError may occur. + * + * Convert a string from one character set to another. 
+ * + * Return value: If the conversion was successful, a newly allocated + * NUL-terminated string, which must be freed with + * g_free. Otherwise %NULL and @error will be set. + **/ +gchar* +g_convert (const gchar *str, + gint len, + const gchar *to_codeset, + const gchar *from_codeset, + gint *bytes_read, + gint *bytes_written, + GError **error) +{ + gchar *dest; + gchar *outp; + const gchar *p; + size_t inbytes_remaining; + size_t outbytes_remaining; + size_t err; + iconv_t cd; + size_t outbuf_size; + gboolean have_error = FALSE; + + g_return_val_if_fail (str != NULL, NULL); + g_return_val_if_fail (to_codeset != NULL, NULL); + g_return_val_if_fail (from_codeset != NULL, NULL); + + cd = open_converter (to_codeset, from_codeset, error); + + if (cd == (iconv_t) -1) + { + if (bytes_read) + *bytes_read = 0; + + if (bytes_written) + *bytes_written = 0; + + return NULL; + } + + if (len < 0) + len = strlen (str); + + p = str; + inbytes_remaining = len; + outbuf_size = len + 1; /* + 1 so there is always room for the terminating nul */ + outbytes_remaining = outbuf_size - 1; /* -1 for nul */ + outp = dest = g_malloc (outbuf_size); + + again: + + err = iconv (cd, &p, &inbytes_remaining, &outp, &outbytes_remaining); + + if (err == (size_t) -1) + { + switch (errno) + { + case EINVAL: + /* Incomplete multibyte sequence at the end of the input: not an error, *bytes_read tells the caller where we stopped */ + break; + case E2BIG: + { + size_t used = outp - dest; + outbuf_size *= 2; + dest = g_realloc (dest, outbuf_size); + + outp = dest + used; + outbytes_remaining = outbuf_size - used - 1; /* -1 for nul */ + + goto again; /* retry with the doubled buffer */ + } + case EILSEQ: + g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, + "Invalid byte sequence in conversion input"); + have_error = TRUE; + break; + default: + g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_OTHER, + "Error during conversion: %s", + strerror (errno)); + have_error = TRUE; + break; + } + } + + *outp = '\0'; + + iconv_close (cd); + + if (bytes_read) + *bytes_read = p - str; + + if (bytes_written) + 
+ *bytes_written = outp - dest; /* Doesn't include '\0' */ + + if (have_error) + { + g_free (dest); + return NULL; + } + else + return dest; +} + +/** + * g_convert_with_fallback: + * @str: the string to convert + * @len: the length of the string in bytes, or a negative value if @str is nul-terminated + * @to_codeset: name of character set into which to convert @str + * @from_codeset: character set of @str. + * @fallback: UTF-8 string to use in place of characters not + * present in the target encoding. (The string itself must + * be representable in the target encoding.) If %NULL, + * characters not in the target encoding will be represented + * as Unicode escapes \x{XXXX} or \x{XXXXXX}. + * @bytes_read: location to store the number of bytes in the input + * string that were successfully converted, or %NULL. Even if the + * conversion was successful, this may be less than @len if there + * were partial characters at the end of the input. If the error + * G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value stored will + * be the byte offset after the last valid input sequence. + * @bytes_written: location to store the number of bytes written to the + * output buffer, not counting the terminating nul, or %NULL. + * @error: location to store the error occurring, or %NULL to ignore + * errors. Any of the errors in #GConvertError may occur. + * + * Convert a string from one character set to another, possibly + * including fallback sequences for characters not representable + * in the output. Note that it is not guaranteed that the specification + * for the fallback sequences in @fallback will be honored. Some + * systems may do an approximate conversion from @from_codeset + * to @to_codeset in their iconv() functions, in which case GLib + * will simply return that approximate conversion. + * + * Return value: If the conversion was successful, a newly allocated + * NUL-terminated string, which must be freed with + * g_free. Otherwise %NULL and @error will be set. 
+ **/ +gchar* +g_convert_with_fallback (const gchar *str, + gint len, + const gchar *to_codeset, + const gchar *from_codeset, + gchar *fallback, + gint *bytes_read, + gint *bytes_written, + GError **error) +{ + gchar *utf8; + gchar *dest; + gchar *outp; + const gchar *insert_str = NULL; + const gchar *p; + size_t inbytes_remaining; + const gchar *save_p = NULL; + size_t save_inbytes = 0; + size_t outbytes_remaining; + size_t err; + iconv_t cd; + size_t outbuf_size; + gboolean have_error = FALSE; + gboolean done = FALSE; + gint utf8_len = 0; /* g_convert() reports byte counts as gint, not size_t */ + GError *local_error = NULL; + + g_return_val_if_fail (str != NULL, NULL); + g_return_val_if_fail (to_codeset != NULL, NULL); + g_return_val_if_fail (from_codeset != NULL, NULL); + + if (len < 0) + len = strlen (str); + + /* Try an exact conversion; we only proceed if this fails + * due to an illegal sequence in the input string. + */ + dest = g_convert (str, len, to_codeset, from_codeset, + bytes_read, bytes_written, &local_error); + if (!local_error) + return dest; + + if (!g_error_matches (local_error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE)) + { + g_propagate_error (error, local_error); + return NULL; + } + else + g_error_free (local_error); + + /* No go; to proceed, we need a converter from "UTF-8" to + * to_codeset, and the string as UTF-8. + */ + cd = open_converter (to_codeset, "UTF-8", error); + if (cd == (iconv_t) -1) + { + if (bytes_read) + *bytes_read = 0; + if (bytes_written) + *bytes_written = 0; + return NULL; + } + + utf8 = g_convert (str, len, "UTF-8", from_codeset, + bytes_read, &utf8_len, error); + if (!utf8) + { + iconv_close (cd); /* don't leak the converter opened above */ + return NULL; + } + inbytes_remaining = utf8_len; + + /* Now the heart of the code. We loop through the UTF-8 string, and + * whenever we hit an offending character, we form fallback, convert + * the fallback to the target codeset, and then go back to + * converting the original string after finishing with the fallback. 
+ * + * The variables save_p and save_inbytes store the input state + * for the original string while we are converting the fallback + */ + p = utf8; + outbuf_size = len + 1; /* + 1 for nul in case len == 1 */ + outbytes_remaining = outbuf_size - 1; /* -1 for nul */ + outp = dest = g_malloc (outbuf_size); + + while (!done && !have_error) + { + err = iconv (cd, &p, &inbytes_remaining, &outp, &outbytes_remaining); + + if (err == (size_t) -1) + { + switch (errno) + { + case EINVAL: + g_assert_not_reached(); + break; + case E2BIG: + { + size_t used = outp - dest; + outbuf_size *= 2; + dest = g_realloc (dest, outbuf_size); + + outp = dest + used; + outbytes_remaining = outbuf_size - used - 1; /* -1 for nul */ + + break; + } + case EILSEQ: + if (save_p) + { + /* Error converting fallback string - fatal + */ + g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, + "Cannot convert fallback '%s' to codeset '%s'", + insert_str, to_codeset); + have_error = TRUE; + break; + } + else + { + if (!fallback) + { + gunichar ch = g_utf8_get_char (p); + insert_str = g_strdup_printf ("\\x{%0*X}", + (ch < 0x10000) ? 
4 : 6, + ch); + } + else + insert_str = fallback; + + save_p = g_utf8_next_char (p); + save_inbytes = inbytes_remaining - (save_p - p); + p = insert_str; + inbytes_remaining = strlen (p); + } + break; + default: + g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_OTHER, + "Error during conversion: %s", + strerror (errno)); + have_error = TRUE; + break; + } + } + else + { + if (save_p) + { + if (!fallback) + g_free ((gchar *)insert_str); + p = save_p; + inbytes_remaining = save_inbytes; + save_p = NULL; + } + else + done = TRUE; + } + } + + /* Cleanup + */ + *outp = '\0'; + iconv_close (cd); + if (bytes_written) + *bytes_written = outp - dest; /* Offset within the output buffer; doesn't include '\0' */ + + g_free (utf8); + + if (have_error) + { + if (save_p && !fallback) + g_free ((gchar *)insert_str); + g_free (dest); + return NULL; + } + else + return dest; +} diff --git a/glib/glib.h b/glib/glib.h index cd5c3d9..0c4d9f5 100644 --- a/glib/glib.h +++ b/glib/glib.h @@ -3403,6 +3403,32 @@ guint g_thread_pool_get_num_unused_threads (void); /* Stop all currently unused threads, but leave the limit untouched */ void g_thread_pool_stop_unused_threads (void); +typedef enum +{ + G_CONVERT_ERROR_NO_CONVERSION, + G_CONVERT_ERROR_ILLEGAL_SEQUENCE, + G_CONVERT_ERROR_OTHER +} GConvertError; + +#define G_CONVERT_ERROR g_convert_error_quark() +GQuark g_convert_error_quark(); + +gchar* g_convert (const gchar *str, + gint len, + const gchar *to_codeset, + const gchar *from_codeset, + gint *bytes_read, + gint *bytes_written, + GError **error); +gchar* g_convert_with_fallback (const gchar *str, + gint len, + const gchar *to_codeset, + const gchar *from_codeset, + gchar *fallback, + gint *bytes_read, + gint *bytes_written, + GError **error); + #ifdef __cplusplus } #endif /* __cplusplus */ -- 2.7.4