Fix bug 326747, Alberto Ruiz:

[platform/upstream/glib.git] / glib / gconvert.c
diff --git a/glib/gconvert.c b/glib/gconvert.c

index 1458759..5b0bb14 100644 (file)
--- a/glib/gconvert.c
+++ b/glib/gconvert.c
@@ -28,7 +28,6 @@
  #include <string.h>
  #include <stdlib.h>
  
-#include "galias.h"
  #include "glib.h"
  #include "gprintfint.h"
  #include "gthreadinit.h"
@@ -48,6 +47,8 @@
  #error GNU libiconv not in use but included iconv.h is from libiconv
  #endif
  
+#include "galias.h"
+
  GQuark 
  g_convert_error_quark (void)
  {
@@ -195,6 +196,8 @@ g_iconv_close (GIConv converter)
  }
  
  
+#ifdef NEED_ICONV_CACHE
+
  #define ICONV_CACHE_SIZE   (16)
  
  struct _iconv_cache_bucket {
@@ -398,14 +401,17 @@ open_converter (const gchar *to_codeset,
    G_UNLOCK (iconv_cache_lock);
    
    /* Something went wrong.  */
-  if (errno == EINVAL)
-    g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_CONVERSION,
-                _("Conversion from character set '%s' to '%s' is not supported"),
-                from_codeset, to_codeset);
-  else
-    g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
-                _("Could not open converter from '%s' to '%s'"),
-                from_codeset, to_codeset);
+  if (error)
+    {
+      if (errno == EINVAL)
+       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_CONVERSION,
+                    _("Conversion from character set '%s' to '%s' is not supported"),
+                    from_codeset, to_codeset);
+      else
+       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
+                    _("Could not open converter from '%s' to '%s'"),
+                    from_codeset, to_codeset);
+    }
    
    return cd;
  }
@@ -459,74 +465,52 @@ close_converter (GIConv converter)
    return 0;
  }
  
+#else  /* !NEED_ICONV_CACHE */
  
-/**
- * g_convert:
- * @str:           the string to convert
- * @len:           the length of the string
- * @to_codeset:    name of character set into which to convert @str
- * @from_codeset:  character set of @str.
- * @bytes_read:    location to store the number of bytes in the
- *                 input string that were successfully converted, or %NULL.
- *                 Even if the conversion was successful, this may be 
- *                 less than @len if there were partial characters
- *                 at the end of the input. If the error
- *                 #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
- *                 stored will the byte offset after the last valid
- *                 input sequence.
- * @bytes_written: the number of bytes stored in the output buffer (not 
- *                 including the terminating nul).
- * @error:         location to store the error occuring, or %NULL to ignore
- *                 errors. Any of the errors in #GConvertError may occur.
- *
- * Converts a string from one character set to another.
- *
- * Return value: If the conversion was successful, a newly allocated
- *               nul-terminated string, which must be freed with
- *               g_free(). Otherwise %NULL and @error will be set.
- **/
-gchar*
-g_convert (const gchar *str,
-           gssize       len,  
-           const gchar *to_codeset,
-           const gchar *from_codeset,
-           gsize       *bytes_read, 
-          gsize       *bytes_written, 
-          GError     **error)
+static GIConv
+open_converter (const gchar *to_codeset,
+               const gchar *from_codeset,
+               GError     **error)
  {
-  gchar *res;
    GIConv cd;
-  
-  g_return_val_if_fail (str != NULL, NULL);
-  g_return_val_if_fail (to_codeset != NULL, NULL);
-  g_return_val_if_fail (from_codeset != NULL, NULL);
-  
-  cd = open_converter (to_codeset, from_codeset, error);
+
+  cd = g_iconv_open (to_codeset, from_codeset);
  
    if (cd == (GIConv) -1)
      {
-      if (bytes_read)
-        *bytes_read = 0;
-      
-      if (bytes_written)
-        *bytes_written = 0;
-      
-      return NULL;
+      /* Something went wrong.  */
+      if (error)
+       {
+         if (errno == EINVAL)
+           g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_CONVERSION,
+                        _("Conversion from character set '%s' to '%s' is not supported"),
+                        from_codeset, to_codeset);
+         else
+           g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
+                        _("Could not open converter from '%s' to '%s'"),
+                        from_codeset, to_codeset);
+       }
      }
-
-  res = g_convert_with_iconv (str, len, cd,
-                             bytes_read, bytes_written,
-                             error);
    
-  close_converter (cd);
+  return cd;
+}
  
-  return res;
+static int
+close_converter (GIConv cd)
+{
+  if (cd == (GIConv) -1)
+    return 0;  /* (GIConv) -1 is the failure value from open_converter(); g_iconv_close() is skipped */
+  /* Build without NEED_ICONV_CACHE: closing just forwards to g_iconv_close().  */
+  return g_iconv_close (cd);  
  }
  
+#endif /* NEED_ICONV_CACHE */
+
  /**
   * g_convert_with_iconv:
   * @str:           the string to convert
- * @len:           the length of the string
+ * @len:           the length of the string, or -1 if the string is 
+ *                 nul-terminated<footnoteref linkend="nul-unsafe"/>. 
   * @converter:     conversion descriptor from g_iconv_open()
   * @bytes_read:    location to store the number of bytes in the
   *                 input string that were successfully converted, or %NULL.
@@ -541,7 +525,22 @@ g_convert (const gchar *str,
   * @error:         location to store the error occuring, or %NULL to ignore
   *                 errors. Any of the errors in #GConvertError may occur.
   *
- * Converts a string from one character set to another.
+ * Converts a string from one character set to another. 
+ * 
+ * Note that you should use g_iconv() for streaming 
+ * conversions<footnote id="streaming-state">
+ *  <para>
+ * Despite the fact that @bytes_read can return information about partial 
+ * characters, the <literal>g_convert_...</literal> functions
+ * are not generally suitable for streaming. If the underlying converter 
+ * being used maintains internal state, then this won't be preserved 
+ * across successive calls to g_convert(), g_convert_with_iconv() or 
+ * g_convert_with_fallback(). (An example of this is the GNU C converter 
+ * for CP1255 which does not emit a base character until it knows that 
+ * the next character is not a mark that could combine with the base 
+ * character.)
+ *  </para>
+ * </footnote>. 
   *
   * Return value: If the conversion was successful, a newly allocated
   *               nul-terminated string, which must be freed with
@@ -558,13 +557,14 @@ g_convert_with_iconv (const gchar *str,
    gchar *dest;
    gchar *outp;
    const gchar *p;
+  const gchar *shift_p = NULL;
    gsize inbytes_remaining;
    gsize outbytes_remaining;
    gsize err;
    gsize outbuf_size;
    gboolean have_error = FALSE;
+  gboolean done = FALSE;
    
-  g_return_val_if_fail (str != NULL, NULL);
    g_return_val_if_fail (converter != (GIConv) -1, NULL);
       
    if (len < 0)
@@ -577,43 +577,61 @@ g_convert_with_iconv (const gchar *str,
    outbytes_remaining = outbuf_size - 1; /* -1 for nul */
    outp = dest = g_malloc (outbuf_size);
  
- again:
-  
-  err = g_iconv (converter, (char **)&p, &inbytes_remaining, &outp, &outbytes_remaining);
-
-  if (err == (size_t) -1)
+  while (!done && !have_error)
      {
-      switch (errno)
+      err = g_iconv (converter, (char **)&p, &inbytes_remaining, &outp, &outbytes_remaining);
+
+      if (err == (size_t) -1)
         {
-       case EINVAL:
-         /* Incomplete text, do not report an error */
-         break;
-       case E2BIG:
-         {
-           size_t used = outp - dest;
-
-           outbuf_size *= 2;
-           dest = g_realloc (dest, outbuf_size);
+         switch (errno)
+           {
+           case EINVAL:
+             /* Incomplete text, do not report an error */
+             done = TRUE;
+             break;
+           case E2BIG:
+             {
+               size_t used = outp - dest;
                 
-           outp = dest + used;
-           outbytes_remaining = outbuf_size - used - 1; /* -1 for nul */
-
-           goto again;
-         }
-       case EILSEQ:
-         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
-                      _("Invalid byte sequence in conversion input"));
-         have_error = TRUE;
-         break;
-       default:
-         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
-                      _("Error during conversion: %s"),
-                      g_strerror (errno));
-         have_error = TRUE;
-         break;
+               outbuf_size *= 2;
+               dest = g_realloc (dest, outbuf_size);
+               
+               outp = dest + used;
+               outbytes_remaining = outbuf_size - used - 1; /* -1 for nul */
+             }
+             break;
+           case EILSEQ:
+             if (error)
+               g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
+                            _("Invalid byte sequence in conversion input"));
+             have_error = TRUE;
+             break;
+           default:
+             if (error)
+               g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
+                            _("Error during conversion: %s"),
+                            g_strerror (errno));
+             have_error = TRUE;
+             break;
+           }
+       }
+      else 
+       {
+         if (!shift_p)
+           {
+             /* call g_iconv with NULL inbuf to cleanup shift state */
+             shift_p = p;
+             p = NULL;
+             inbytes_remaining = 0;
+           }
+         else
+           done = TRUE;
         }
      }
  
+  if (shift_p)
+    p = shift_p;
+
    *outp = '\0';
    
    if (bytes_read)
@@ -624,8 +642,9 @@ g_convert_with_iconv (const gchar *str,
         {
            if (!have_error)
              {
-              g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
-                           _("Partial character sequence at end of input"));
+             if (error)
+               g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
+                            _("Partial character sequence at end of input"));
                have_error = TRUE;
              }
         }
@@ -644,16 +663,90 @@ g_convert_with_iconv (const gchar *str,
  }
  
  /**
+ * g_convert:
+ * @str:           the string to convert
+ * @len:           the length of the string, or -1 if the string is 
+ *                 nul-terminated<footnote id="nul-unsafe">
+ *                   <para>
+ *                     Note that some encodings may allow nul bytes to 
+ *                     occur inside strings. In that case, using -1 for 
+ *                     the @len parameter is unsafe.
+ *                   </para>
+ *                 </footnote>. 
+ * @to_codeset:    name of character set into which to convert @str
+ * @from_codeset:  character set of @str.
+ * @bytes_read:    location to store the number of bytes in the
+ *                 input string that were successfully converted, or %NULL.
+ *                 Even if the conversion was successful, this may be 
+ *                 less than @len if there were partial characters
+ *                 at the end of the input. If the error
+ *                 #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
+ *                 stored will be the byte offset after the last valid
+ *                 input sequence.
+ * @bytes_written: the number of bytes stored in the output buffer (not 
+ *                 including the terminating nul).
+ * @error:         location to store the error occurring, or %NULL to ignore
+ *                 errors. Any of the errors in #GConvertError may occur.
+ *
+ * Converts a string from one character set to another.
+ *
+ * Note that you should use g_iconv() for streaming 
+ * conversions<footnoteref linkend="streaming-state"/>.
+ *
+ * Return value: If the conversion was successful, a newly allocated
+ *               nul-terminated string, which must be freed with
+ *               g_free(). Otherwise %NULL and @error will be set.
+ **/
+gchar*
+g_convert (const gchar *str,
+           gssize       len,  
+           const gchar *to_codeset,
+           const gchar *from_codeset,
+           gsize       *bytes_read, 
+          gsize       *bytes_written, 
+          GError     **error)
+{
+  gchar *res;
+  GIConv cd;
+
+  g_return_val_if_fail (str != NULL, NULL);
+  g_return_val_if_fail (to_codeset != NULL, NULL);
+  g_return_val_if_fail (from_codeset != NULL, NULL);
+  
+  cd = open_converter (to_codeset, from_codeset, error);
+
+  if (cd == (GIConv) -1)
+    {
+      if (bytes_read)
+        *bytes_read = 0;
+      
+      if (bytes_written)
+        *bytes_written = 0;
+      
+      return NULL;
+    }
+
+  res = g_convert_with_iconv (str, len, cd,
+                             bytes_read, bytes_written,
+                             error);
+
+  close_converter (cd);
+
+  return res;
+}
+
+/**
   * g_convert_with_fallback:
   * @str:          the string to convert
- * @len:          the length of the string
+ * @len:          the length of the string, or -1 if the string is 
+ *                nul-terminated<footnoteref linkend="nul-unsafe"/>. 
   * @to_codeset:   name of character set into which to convert @str
   * @from_codeset: character set of @str.
   * @fallback:     UTF-8 string to use in place of character not
- *                present in the target encoding. (This must be
- *                in the target encoding), if %NULL, characters
- *                not in the target encoding will be represented
- *                as Unicode escapes \uxxxx or \Uxxxxyyyy.
+ *                present in the target encoding. (The string must be
+ *                representable in the target encoding). 
+ *                If %NULL, characters not in the target encoding will 
+ *                be represented as Unicode escapes \uxxxx or \Uxxxxyyyy.
   * @bytes_read:   location to store the number of bytes in the
   *                input string that were successfully converted, or %NULL.
   *                Even if the conversion was successful, this may be 
@@ -672,6 +765,9 @@ g_convert_with_iconv (const gchar *str,
   * to @to_codeset in their iconv() functions, 
   * in which case GLib will simply return that approximate conversion.
   *
+ * Note that you should use g_iconv() for streaming 
+ * conversions<footnoteref linkend="streaming-state"/>.
+ *
   * Return value: If the conversion was successful, a newly allocated
   *               nul-terminated string, which must be freed with
   *               g_free(). Otherwise %NULL and @error will be set.
@@ -803,7 +899,7 @@ g_convert_with_fallback (const gchar *str,
                   have_error = TRUE;
                   break;
                 }
-             else
+             else if (p)
                 {
                   if (!fallback)
                     { 
@@ -818,8 +914,9 @@ g_convert_with_fallback (const gchar *str,
                   save_inbytes = inbytes_remaining - (save_p - p);
                   p = insert_str;
                   inbytes_remaining = strlen (p);
+                 break;
                 }
-             break;
+             /* fall thru if p is NULL */
             default:
               g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
                            _("Error during conversion: %s"),
@@ -838,6 +935,12 @@ g_convert_with_fallback (const gchar *str,
               inbytes_remaining = save_inbytes;
               save_p = NULL;
             }
+         else if (p)
+           {
+             /* call g_iconv with NULL inbuf to cleanup shift state */
+             p = NULL;
+             inbytes_remaining = 0;
+           }
           else
             done = TRUE;
         }
@@ -913,9 +1016,10 @@ strdup_len (const gchar *string,
  
  /**
   * g_locale_to_utf8:
- * @opsysstring:   a string in the encoding of the current locale
+ * @opsysstring:   a string in the encoding of the current locale. On Windows
+ *                 this means the system codepage.
   * @len:           the length of the string, or -1 if the string is
- *                 nul-terminated.
+ *                 nul-terminated<footnoteref linkend="nul-unsafe"/>. 
   * @bytes_read:    location to store the number of bytes in the
   *                 input string that were successfully converted, or %NULL.
   *                 Even if the conversion was successful, this may be 
@@ -955,7 +1059,7 @@ g_locale_to_utf8 (const gchar  *opsysstring,
   * g_locale_from_utf8:
   * @utf8string:    a UTF-8 encoded string 
   * @len:           the length of the string, or -1 if the string is
- *                 nul-terminated.
+ *                 nul-terminated<footnoteref linkend="nul-unsafe"/>. 
   * @bytes_read:    location to store the number of bytes in the
   *                 input string that were successfully converted, or %NULL.
   *                 Even if the conversion was successful, this may be 
@@ -1069,7 +1173,7 @@ g_get_filename_charsets (G_CONST_RETURN gchar ***filename_charsets)
        cache->charset = g_strdup (charset);
        
        p = getenv ("G_FILENAME_ENCODING");
-      if (p != NULL) 
+      if (p != NULL && p[0] != '\0') 
         {
           cache->filename_charsets = g_strsplit (p, ",", 0);
           cache->is_utf8 = (strcmp (cache->filename_charsets[0], "UTF-8") == 0);
@@ -1111,7 +1215,7 @@ g_get_filename_charsets (G_CONST_RETURN gchar ***filename_charsets)
  gboolean
  g_get_filename_charsets (G_CONST_RETURN gchar ***filename_charsets) 
  {
-  static gchar *charsets[] = {
+  static const gchar *charsets[] = {
      "UTF-8",
      NULL
    };
@@ -1165,7 +1269,7 @@ _g_convert_thread_init (void)
   * g_filename_to_utf8:
   * @opsysstring:   a string in the encoding for filenames
   * @len:           the length of the string, or -1 if the string is
- *                 nul-terminated.
+ *                 nul-terminated<footnoteref linkend="nul-unsafe"/>. 
   * @bytes_read:    location to store the number of bytes in the
   *                 input string that were successfully converted, or %NULL.
   *                 Even if the conversion was successful, this may be 
@@ -1179,8 +1283,9 @@ _g_convert_thread_init (void)
   * @error:         location to store the error occuring, or %NULL to ignore
   *                 errors. Any of the errors in #GConvertError may occur.
   * 
- * Converts a string which is in the encoding used by GLib for filenames
- * into a UTF-8 string.
+ * Converts a string which is in the encoding used by GLib for
+ * filenames into a UTF-8 string. Note that on Windows GLib uses UTF-8
+ * for filenames.
   * 
   * Return value: The converted string, or %NULL on an error.
   **/
@@ -1242,7 +1347,8 @@ g_filename_to_utf8 (const gchar *opsysstring,
   * @error:         location to store the error occuring, or %NULL to ignore
   *                 errors. Any of the errors in #GConvertError may occur.
   * 
- * Converts a string from UTF-8 to the encoding used for filenames.
+ * Converts a string from UTF-8 to the encoding GLib uses for
+ * filenames. Note that on Windows GLib uses UTF-8 for filenames.
   * 
   * Return value: The converted string, or %NULL on an error.
   **/
@@ -1722,8 +1828,9 @@ g_filename_from_uri (const gchar *uri,
  
  /**
   * g_filename_to_uri:
- * @filename: an absolute filename specified in the encoding
- *            used for filenames by the operating system.
+ * @filename: an absolute filename specified in the GLib file name encoding,
+ *            which is the on-disk file name bytes on Unix, and UTF-8 on 
+ *            Windows
   * @hostname: A UTF-8 encoded hostname, or %NULL for none.
   * @error: location to store the error occuring, or %NULL to ignore
   *         errors. Any of the errors in #GConvertError may occur.
@@ -1889,7 +1996,8 @@ make_valid_utf8 (const gchar *name)
         string = g_string_sized_new (remaining_bytes);
  
        g_string_append_len (string, remainder, valid_bytes);
-      g_string_append_c (string, '?');
+      /* append U+FFFD REPLACEMENT CHARACTER */
+      g_string_append (string, "\357\277\275");
        
        remaining_bytes -= valid_bytes + 1;
        remainder = invalid + 1;
@@ -1899,7 +2007,6 @@ make_valid_utf8 (const gchar *name)
      return g_strdup (name);
    
    g_string_append (string, remainder);
-  g_string_append (string, " (invalid encoding)");
  
    g_assert (g_utf8_validate (string->str, -1, NULL));
    
@@ -1913,7 +2020,13 @@ make_valid_utf8 (const gchar *name)
   * Returns the display basename for the particular filename, guaranteed
   * to be valid UTF-8. The display name might not be identical to the filename,
   * for instance there might be problems converting it to UTF-8, and some files
- * can be translated in the display
+ * can be translated in the display.
+ *
+ * If GLib cannot make sense of the encoding of @filename, as a last resort it 
+ * replaces unknown characters with U+FFFD, the Unicode replacement character.
+ * You can search the result for the UTF-8 encoding of this character (which is
+ * "\357\277\275" in octal notation) to find out if @filename was in an invalid
+ * encoding.
   *
   * You must pass the whole absolute pathname to this functions so that
   * translation of well known locations can be done.
@@ -1944,13 +2057,17 @@ g_filename_display_basename (const gchar *filename)
   * g_filename_display_name:
   * @filename: a pathname hopefully in the GLib file name encoding
   * 
- * Converts a filename into a valid UTF-8 string. The 
- * conversion is not necessarily reversible, so you 
- * should keep the original around and use the return
- * value of this function only for display purposes.
- * Unlike g_filename_to_utf8(), the result is guaranteed 
- * to be non-NULL even if the filename actually isn't in the GLib
- * file name encoding.
+ * Converts a filename into a valid UTF-8 string. The conversion is 
+ * not necessarily reversible, so you should keep the original around 
+ * and use the return value of this function only for display purposes.
+ * Unlike g_filename_to_utf8(), the result is guaranteed to be non-%NULL 
+ * even if the filename actually isn't in the GLib file name encoding.
+ *
+ * If GLib cannot make sense of the encoding of @filename, as a last resort it 
+ * replaces unknown characters with U+FFFD, the Unicode replacement character.
+ * You can search the result for the UTF-8 encoding of this character (which is
+ * "\357\277\275" in octal notation) to find out if @filename was in an invalid
+ * encoding.
   *
   * If you know the whole pathname of the file you should use
   * g_filename_display_basename(), since that allows location-based
@@ -2001,3 +2118,5 @@ g_filename_display_name (const gchar *filename)
    return display_name;
  }
  
+#define __G_CONVERT_C__
+#include "galiasdef.c"