#include <string.h>
#include <stdlib.h>
-#include "galias.h"
#include "glib.h"
#include "gprintfint.h"
#include "gthreadinit.h"
#error GNU libiconv not in use but included iconv.h is from libiconv
#endif
+#include "galias.h"
+
GQuark
g_convert_error_quark (void)
{
}
+#ifdef NEED_ICONV_CACHE
+
#define ICONV_CACHE_SIZE (16)
struct _iconv_cache_bucket {
G_UNLOCK (iconv_cache_lock);
/* Something went wrong. */
- if (errno == EINVAL)
- g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_CONVERSION,
- _("Conversion from character set '%s' to '%s' is not supported"),
- from_codeset, to_codeset);
- else
- g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
- _("Could not open converter from '%s' to '%s'"),
- from_codeset, to_codeset);
+ if (error)
+ {
+ if (errno == EINVAL)
+ g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_CONVERSION,
+ _("Conversion from character set '%s' to '%s' is not supported"),
+ from_codeset, to_codeset);
+ else
+ g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
+ _("Could not open converter from '%s' to '%s'"),
+ from_codeset, to_codeset);
+ }
return cd;
}
return 0;
}
+#else /* !NEED_ICONV_CACHE */
-/**
- * g_convert:
- * @str: the string to convert
- * @len: the length of the string
- * @to_codeset: name of character set into which to convert @str
- * @from_codeset: character set of @str.
- * @bytes_read: location to store the number of bytes in the
- * input string that were successfully converted, or %NULL.
- * Even if the conversion was successful, this may be
- * less than @len if there were partial characters
- * at the end of the input. If the error
- * #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
- * stored will the byte offset after the last valid
- * input sequence.
- * @bytes_written: the number of bytes stored in the output buffer (not
- * including the terminating nul).
- * @error: location to store the error occuring, or %NULL to ignore
- * errors. Any of the errors in #GConvertError may occur.
- *
- * Converts a string from one character set to another.
- *
- * Return value: If the conversion was successful, a newly allocated
- * nul-terminated string, which must be freed with
- * g_free(). Otherwise %NULL and @error will be set.
- **/
-gchar*
-g_convert (const gchar *str,
- gssize len,
- const gchar *to_codeset,
- const gchar *from_codeset,
- gsize *bytes_read,
- gsize *bytes_written,
- GError **error)
+static GIConv
+open_converter (const gchar *to_codeset,
+ const gchar *from_codeset,
+ GError **error)
{
- gchar *res;
GIConv cd;
-
- g_return_val_if_fail (str != NULL, NULL);
- g_return_val_if_fail (to_codeset != NULL, NULL);
- g_return_val_if_fail (from_codeset != NULL, NULL);
-
- cd = open_converter (to_codeset, from_codeset, error);
+
+ cd = g_iconv_open (to_codeset, from_codeset); /* yields (GIConv) -1 on failure, tested just below */
if (cd == (GIConv) -1)
{
- if (bytes_read)
- *bytes_read = 0;
-
- if (bytes_written)
- *bytes_written = 0;
-
- return NULL;
+ /* Something went wrong. Describe the failure only when the caller
+ * supplied an @error location; the descriptor is returned either way. */
+ if (error)
+ {
+ if (errno == EINVAL) /* EINVAL is reported as "conversion not supported" */
+ g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_CONVERSION,
+ _("Conversion from character set '%s' to '%s' is not supported"),
+ from_codeset, to_codeset);
+ else /* any other errno becomes the generic G_CONVERT_ERROR_FAILED */
+ g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
+ _("Could not open converter from '%s' to '%s'"),
+ from_codeset, to_codeset);
+ }
}
-
- res = g_convert_with_iconv (str, len, cd,
- bytes_read, bytes_written,
- error);
- close_converter (cd);
+ return cd; /* may still be (GIConv) -1: callers have to check before use */
+}
- return res;
+static int
+close_converter (GIConv cd)
+{
+ if (cd == (GIConv) -1) /* invalid descriptor: nothing to close, report success (0) */
+ return 0;
+
+ return g_iconv_close (cd); /* hand g_iconv_close()'s result straight back to the caller */
}
+#endif /* NEED_ICONV_CACHE */
+
/**
* g_convert_with_iconv:
* @str: the string to convert
- * @len: the length of the string
+ * @len: the length of the string, or -1 if the string is
+ * nul-terminated<footnoteref linkend="nul-unsafe"/>.
* @converter: conversion descriptor from g_iconv_open()
* @bytes_read: location to store the number of bytes in the
* input string that were successfully converted, or %NULL.
* @error: location to store the error occuring, or %NULL to ignore
* errors. Any of the errors in #GConvertError may occur.
*
- * Converts a string from one character set to another.
+ * Converts a string from one character set to another.
+ *
+ * Note that you should use g_iconv() for streaming
+ * conversions<footnote id="streaming-state">
+ * <para>
+ * Despite the fact that @bytes_read can return information about partial
+ * characters, the <literal>g_convert_...</literal> functions
+ * are not generally suitable for streaming. If the underlying converter
+ * being used maintains internal state, then this won't be preserved
+ * across successive calls to g_convert(), g_convert_with_iconv() or
+ * g_convert_with_fallback(). (An example of this is the GNU C converter
+ * for CP1255 which does not emit a base character until it knows that
+ * the next character is not a mark that could combine with the base
+ * character.)
+ * </para>
+ * </footnote>.
*
* Return value: If the conversion was successful, a newly allocated
* nul-terminated string, which must be freed with
gchar *dest;
gchar *outp;
const gchar *p;
+ const gchar *shift_p = NULL;
gsize inbytes_remaining;
gsize outbytes_remaining;
gsize err;
gsize outbuf_size;
gboolean have_error = FALSE;
+ gboolean done = FALSE;
- g_return_val_if_fail (str != NULL, NULL);
g_return_val_if_fail (converter != (GIConv) -1, NULL);
if (len < 0)
outbytes_remaining = outbuf_size - 1; /* -1 for nul */
outp = dest = g_malloc (outbuf_size);
- again:
-
- err = g_iconv (converter, (char **)&p, &inbytes_remaining, &outp, &outbytes_remaining);
-
- if (err == (size_t) -1)
+ while (!done && !have_error)
{
- switch (errno)
+ err = g_iconv (converter, (char **)&p, &inbytes_remaining, &outp, &outbytes_remaining);
+
+ if (err == (size_t) -1)
{
- case EINVAL:
- /* Incomplete text, do not report an error */
- break;
- case E2BIG:
- {
- size_t used = outp - dest;
-
- outbuf_size *= 2;
- dest = g_realloc (dest, outbuf_size);
+ switch (errno)
+ {
+ case EINVAL:
+ /* Incomplete text, do not report an error */
+ done = TRUE;
+ break;
+ case E2BIG:
+ {
+ size_t used = outp - dest;
- outp = dest + used;
- outbytes_remaining = outbuf_size - used - 1; /* -1 for nul */
-
- goto again;
- }
- case EILSEQ:
- g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
- _("Invalid byte sequence in conversion input"));
- have_error = TRUE;
- break;
- default:
- g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
- _("Error during conversion: %s"),
- g_strerror (errno));
- have_error = TRUE;
- break;
+ outbuf_size *= 2;
+ dest = g_realloc (dest, outbuf_size);
+
+ outp = dest + used;
+ outbytes_remaining = outbuf_size - used - 1; /* -1 for nul */
+ }
+ break;
+ case EILSEQ:
+ if (error)
+ g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
+ _("Invalid byte sequence in conversion input"));
+ have_error = TRUE;
+ break;
+ default:
+ if (error)
+ g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
+ _("Error during conversion: %s"),
+ g_strerror (errno));
+ have_error = TRUE;
+ break;
+ }
+ }
+ else
+ {
+ if (!shift_p)
+ {
+ /* call g_iconv with NULL inbuf to cleanup shift state */
+ shift_p = p;
+ p = NULL;
+ inbytes_remaining = 0;
+ }
+ else
+ done = TRUE;
}
}
+ if (shift_p)
+ p = shift_p;
+
*outp = '\0';
if (bytes_read)
{
if (!have_error)
{
- g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
- _("Partial character sequence at end of input"));
+ if (error)
+ g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
+ _("Partial character sequence at end of input"));
have_error = TRUE;
}
}
}
/**
+ * g_convert:
+ * @str: the string to convert
+ * @len: the length of the string, or -1 if the string is
+ * nul-terminated<footnote id="nul-unsafe">
+ <para>
+ Note that some encodings may allow nul bytes to
+ occur inside strings. In that case, using -1 for
+ the @len parameter is unsafe.
+ </para>
+ </footnote>.
+ * @to_codeset: name of character set into which to convert @str
+ * @from_codeset: character set of @str.
+ * @bytes_read: location to store the number of bytes in the
+ * input string that were successfully converted, or %NULL.
+ * Even if the conversion was successful, this may be
+ * less than @len if there were partial characters
+ * at the end of the input. If the error
+ * #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
+ * stored will be the byte offset after the last valid
+ * input sequence. If no converter could be opened
+ * for the requested character sets, 0 is stored.
+ * @bytes_written: the number of bytes stored in the output buffer (not
+ * including the terminating nul), or %NULL. If no converter
+ * could be opened for the requested character sets,
+ * 0 is stored.
+ * @error: location to store the error occurring, or %NULL to ignore
+ * errors. Any of the errors in #GConvertError may occur.
+ *
+ * Converts a string from one character set to another.
+ *
+ * A converter for the @from_codeset / @to_codeset pair is opened for
+ * the duration of the call, the conversion itself is delegated to
+ * g_convert_with_iconv(), and the converter is closed again before
+ * returning.
+ *
+ * Note that you should use g_iconv() for streaming
+ * conversions<footnoteref linkend="streaming-state"/>.
+ *
+ * Return value: If the conversion was successful, a newly allocated
+ * nul-terminated string, which must be freed with
+ * g_free(). Otherwise %NULL and @error will be set.
+ **/
+gchar*
+g_convert (const gchar *str,
+ gssize len,
+ const gchar *to_codeset,
+ const gchar *from_codeset,
+ gsize *bytes_read,
+ gsize *bytes_written,
+ GError **error)
+{
+ gchar *res;
+ GIConv cd;
+
+ g_return_val_if_fail (str != NULL, NULL);
+ g_return_val_if_fail (to_codeset != NULL, NULL);
+ g_return_val_if_fail (from_codeset != NULL, NULL);
+
+ cd = open_converter (to_codeset, from_codeset, error); /* fills in @error itself on failure */
+
+ if (cd == (GIConv) -1)
+ { /* no converter: report zero progress through the optional out-parameters */
+ if (bytes_read)
+ *bytes_read = 0;
+
+ if (bytes_written)
+ *bytes_written = 0;
+
+ return NULL;
+ }
+
+ res = g_convert_with_iconv (str, len, cd,
+ bytes_read, bytes_written,
+ error);
+
+ close_converter (cd); /* result of closing is not checked; res is returned regardless */
+
+ return res;
+}
+
+
+/**
* g_convert_with_fallback:
* @str: the string to convert
- * @len: the length of the string
+ * @len: the length of the string, or -1 if the string is
+ * nul-terminated<footnoteref linkend="nul-unsafe"/>.
* @to_codeset: name of character set into which to convert @str
* @from_codeset: character set of @str.
* @fallback: UTF-8 string to use in place of character not
- * present in the target encoding. (This must be
- * in the target encoding), if %NULL, characters
- * not in the target encoding will be represented
- * as Unicode escapes \uxxxx or \Uxxxxyyyy.
+ * present in the target encoding. (The string must be
+ * representable in the target encoding).
+ If %NULL, characters not in the target encoding will
+ be represented as Unicode escapes \uxxxx or \Uxxxxyyyy.
* @bytes_read: location to store the number of bytes in the
* input string that were successfully converted, or %NULL.
* Even if the conversion was successful, this may be
* to @to_codeset in their iconv() functions,
* in which case GLib will simply return that approximate conversion.
*
+ * Note that you should use g_iconv() for streaming
+ * conversions<footnoteref linkend="streaming-state"/>.
+ *
* Return value: If the conversion was successful, a newly allocated
* nul-terminated string, which must be freed with
* g_free(). Otherwise %NULL and @error will be set.
have_error = TRUE;
break;
}
- else
+ else if (p)
{
if (!fallback)
{
save_inbytes = inbytes_remaining - (save_p - p);
p = insert_str;
inbytes_remaining = strlen (p);
+ break;
}
- break;
+ /* fall thru if p is NULL */
default:
g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
_("Error during conversion: %s"),
inbytes_remaining = save_inbytes;
save_p = NULL;
}
+ else if (p)
+ {
+ /* call g_iconv with NULL inbuf to cleanup shift state */
+ p = NULL;
+ inbytes_remaining = 0;
+ }
else
done = TRUE;
}
/**
* g_locale_to_utf8:
- * @opsysstring: a string in the encoding of the current locale
+ * @opsysstring: a string in the encoding of the current locale. On Windows
+ * this means the system codepage.
* @len: the length of the string, or -1 if the string is
- * nul-terminated.
+ * nul-terminated<footnoteref linkend="nul-unsafe"/>.
* @bytes_read: location to store the number of bytes in the
* input string that were successfully converted, or %NULL.
* Even if the conversion was successful, this may be
* g_locale_from_utf8:
* @utf8string: a UTF-8 encoded string
* @len: the length of the string, or -1 if the string is
- * nul-terminated.
+ * nul-terminated<footnoteref linkend="nul-unsafe"/>.
* @bytes_read: location to store the number of bytes in the
* input string that were successfully converted, or %NULL.
* Even if the conversion was successful, this may be
cache->charset = g_strdup (charset);
p = getenv ("G_FILENAME_ENCODING");
- if (p != NULL)
+ if (p != NULL && p[0] != '\0')
{
cache->filename_charsets = g_strsplit (p, ",", 0);
cache->is_utf8 = (strcmp (cache->filename_charsets[0], "UTF-8") == 0);
gboolean
g_get_filename_charsets (G_CONST_RETURN gchar ***filename_charsets)
{
- static gchar *charsets[] = {
+ static const gchar *charsets[] = {
"UTF-8",
NULL
};
* g_filename_to_utf8:
* @opsysstring: a string in the encoding for filenames
* @len: the length of the string, or -1 if the string is
- * nul-terminated.
+ * nul-terminated<footnoteref linkend="nul-unsafe"/>.
* @bytes_read: location to store the number of bytes in the
* input string that were successfully converted, or %NULL.
* Even if the conversion was successful, this may be
* @error: location to store the error occuring, or %NULL to ignore
* errors. Any of the errors in #GConvertError may occur.
*
- * Converts a string which is in the encoding used by GLib for filenames
- * into a UTF-8 string.
+ * Converts a string which is in the encoding used by GLib for
+ * filenames into a UTF-8 string. Note that on Windows GLib uses UTF-8
+ * for filenames.
*
* Return value: The converted string, or %NULL on an error.
**/
* @error: location to store the error occuring, or %NULL to ignore
* errors. Any of the errors in #GConvertError may occur.
*
- * Converts a string from UTF-8 to the encoding used for filenames.
+ * Converts a string from UTF-8 to the encoding GLib uses for
+ * filenames. Note that on Windows GLib uses UTF-8 for filenames.
*
* Return value: The converted string, or %NULL on an error.
**/
/**
* g_filename_to_uri:
- * @filename: an absolute filename specified in the encoding
- * used for filenames by the operating system.
+ * @filename: an absolute filename specified in the GLib file name encoding,
+ * which is the on-disk file name bytes on Unix, and UTF-8 on
+ * Windows
* @hostname: A UTF-8 encoded hostname, or %NULL for none.
* @error: location to store the error occuring, or %NULL to ignore
* errors. Any of the errors in #GConvertError may occur.
string = g_string_sized_new (remaining_bytes);
g_string_append_len (string, remainder, valid_bytes);
- g_string_append_c (string, '?');
+ /* append U+FFFD REPLACEMENT CHARACTER */
+ g_string_append (string, "\357\277\275");
remaining_bytes -= valid_bytes + 1;
remainder = invalid + 1;
return g_strdup (name);
g_string_append (string, remainder);
- g_string_append (string, " (invalid encoding)");
g_assert (g_utf8_validate (string->str, -1, NULL));
* Returns the display basename for the particular filename, guaranteed
* to be valid UTF-8. The display name might not be identical to the filename,
* for instance there might be problems converting it to UTF-8, and some files
- * can be translated in the display
+ * can be translated in the display.
+ *
+ * If GLib can not make sense of the encoding of @filename, as a last resort it
+ * replaces unknown characters with U+FFFD, the Unicode replacement character.
+ * You can search the result for the UTF-8 encoding of this character (which is
+ * "\357\277\275" in octal notation) to find out if @filename was in an invalid
+ * encoding.
*
* You must pass the whole absolute pathname to this functions so that
* translation of well known locations can be done.
* g_filename_display_name:
* @filename: a pathname hopefully in the GLib file name encoding
*
- * Converts a filename into a valid UTF-8 string. The
- * conversion is not necessarily reversible, so you
- * should keep the original around and use the return
- * value of this function only for display purposes.
- * Unlike g_filename_to_utf8(), the result is guaranteed
- * to be non-NULL even if the filename actually isn't in the GLib
- * file name encoding.
+ * Converts a filename into a valid UTF-8 string. The conversion is
+ * not necessarily reversible, so you should keep the original around
+ * and use the return value of this function only for display purposes.
+ * Unlike g_filename_to_utf8(), the result is guaranteed to be non-%NULL
+ * even if the filename actually isn't in the GLib file name encoding.
+ *
+ * If GLib can not make sense of the encoding of @filename, as a last resort it
+ * replaces unknown characters with U+FFFD, the Unicode replacement character.
+ * You can search the result for the UTF-8 encoding of this character (which is
+ * "\357\277\275" in octal notation) to find out if @filename was in an invalid
+ * encoding.
*
* If you know the whole pathname of the file you should use
* g_filename_display_basename(), since that allows location-based
return display_name;
}
+#define __G_CONVERT_C__
+#include "galiasdef.c"