Imported Upstream version 2.61.2

[platform/upstream/glib.git] / glib / gutf8.c
diff --git a/glib/gutf8.c b/glib/gutf8.c

index 4c5cacc..a19f720 100644 (file)
--- a/glib/gutf8.c
+++ b/glib/gutf8.c
@@ -6,7 +6,7 @@
   * This library is free software; you can redistribute it and/or
   * modify it under the terms of the GNU Lesser General Public
   * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
   *
   * This library is distributed in the hope that it will be useful,
   * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -136,7 +136,7 @@ const gchar * const g_utf8_skip = utf8_skip_data;
   * is made to see if the character found is actually valid other than
   * it starts with an appropriate byte.
   *
- * Returns: a pointer to the found character or %NULL.
+ * Returns: (transfer none) (nullable): a pointer to the found character or %NULL.
   */
  gchar *
  g_utf8_find_prev_char (const char *str,
@@ -162,7 +162,13 @@ g_utf8_find_prev_char (const char *str,
   * is made to see if the character found is actually valid other than
   * it starts with an appropriate byte.
   * 
- * Returns: a pointer to the found character or %NULL
+ * If @end is %NULL, the return value will never be %NULL: if the end of the
+ * string is reached, a pointer to the terminating nul byte is returned. If
+ * @end is non-%NULL, the return value will be %NULL if the end of the string
+ * is reached.
+ *
+ * Returns: (transfer none) (nullable): a pointer to the found character or %NULL if @end is
+ *    set and is reached
   */
  gchar *
  g_utf8_find_next_char (const gchar *p,
@@ -193,7 +199,7 @@ g_utf8_find_next_char (const gchar *p,
   * it starts with an appropriate byte. If @p might be the first
   * character of the string, you must use g_utf8_find_prev_char() instead.
   * 
- * Returns: a pointer to the found character
+ * Returns: (transfer none) (not nullable): a pointer to the found character
   */
  gchar *
  g_utf8_prev_char (const gchar *p)
@@ -269,7 +275,7 @@ g_utf8_strlen (const gchar *p,
   * Copies a substring out of a UTF-8 encoded string.
   * The substring will contain @end_pos - @start_pos characters.
   *
- * Returns: a newly allocated copy of the requested
+ * Returns: (transfer full): a newly allocated copy of the requested
   *     substring. Free with g_free() when no longer needed.
   *
   * Since: 2.30
@@ -338,7 +344,7 @@ g_utf8_get_char (const gchar *p)
   * This limitation exists as this function is called frequently during
   * text rendering and therefore has to be as fast as possible.
   *
- * Returns: the resulting pointer
+ * Returns: (transfer none): the resulting pointer
   */
  gchar *
  g_utf8_offset_to_pointer  (const gchar *str,
@@ -406,7 +412,7 @@ g_utf8_pointer_to_offset (const gchar *str,
  
  /**
   * g_utf8_strncpy:
- * @dest: buffer to fill with characters from @src
+ * @dest: (transfer none): buffer to fill with characters from @src
   * @src: UTF-8 encoded string
   * @n: character count
   * 
@@ -415,7 +421,10 @@ g_utf8_pointer_to_offset (const gchar *str,
   * must be valid UTF-8 encoded text. (Use g_utf8_validate() on all
   * text before trying to use UTF-8 utility functions with it.)
   * 
- * Returns: @dest
+ * Note you must ensure @dest is at least 4 * @n bytes long to fit the
+ * largest possible UTF-8 characters.
+ *
+ * Returns: (transfer none): @dest
   */
  gchar *
  g_utf8_strncpy (gchar       *dest,
@@ -509,7 +518,7 @@ g_unichar_to_utf8 (gunichar c,
   * in a UTF-8 encoded string, while limiting the search to @len bytes.
   * If @len is -1, allow unbounded search.
   * 
- * Returns: %NULL if the string does not contain the character, 
+ * Returns: (transfer none) (nullable): %NULL if the string does not contain the character,
   *     otherwise, a pointer to the start of the leftmost occurrence
   *     of the character in the string.
   */
@@ -537,7 +546,7 @@ g_utf8_strchr (const char *p,
   * in a UTF-8 encoded string, while limiting the search to @len bytes.
   * If @len is -1, allow unbounded search.
   * 
- * Returns: %NULL if the string does not contain the character, 
+ * Returns: (transfer none) (nullable): %NULL if the string does not contain the character,
   *     otherwise, a pointer to the start of the rightmost occurrence
   *     of the character in the string.
   */
@@ -567,6 +576,8 @@ g_utf8_get_char_extended (const  gchar *p,
    guint i, len;
    gunichar min_code;
    gunichar wc = (guchar) *p;
+  const gunichar partial_sequence = (gunichar) -2;
+  const gunichar malformed_sequence = (gunichar) -1;
  
    if (wc < 0x80)
      {
@@ -574,7 +585,7 @@ g_utf8_get_char_extended (const  gchar *p,
      }
    else if (G_UNLIKELY (wc < 0xc0))
      {
-      return (gunichar)-1;
+      return malformed_sequence;
      }
    else if (wc < 0xe0)
      {
@@ -608,7 +619,7 @@ g_utf8_get_char_extended (const  gchar *p,
      }
    else
      {
-      return (gunichar)-1;
+      return malformed_sequence;
      }
  
    if (G_UNLIKELY (max_len >= 0 && len > max_len))
@@ -616,9 +627,9 @@ g_utf8_get_char_extended (const  gchar *p,
        for (i = 1; i < max_len; i++)
         {
           if ((((guchar *)p)[i] & 0xc0) != 0x80)
-           return (gunichar)-1;
+           return malformed_sequence;
         }
-      return (gunichar)-2;
+      return partial_sequence;
      }
  
    for (i = 1; i < len; ++i)
@@ -628,9 +639,9 @@ g_utf8_get_char_extended (const  gchar *p,
        if (G_UNLIKELY ((ch & 0xc0) != 0x80))
         {
           if (ch)
-           return (gunichar)-1;
+           return malformed_sequence;
           else
-           return (gunichar)-2;
+           return partial_sequence;
         }
  
        wc <<= 6;
@@ -638,7 +649,7 @@ g_utf8_get_char_extended (const  gchar *p,
      }
  
    if (G_UNLIKELY (wc < min_code))
-    return (gunichar)-1;
+    return malformed_sequence;
  
    return wc;
  }
@@ -646,13 +657,16 @@ g_utf8_get_char_extended (const  gchar *p,
  /**
   * g_utf8_get_char_validated:
   * @p: a pointer to Unicode character encoded as UTF-8
- * @max_len: the maximum number of bytes to read, or -1, for no maximum or
- *     if @p is nul-terminated
- * 
+ * @max_len: the maximum number of bytes to read, or -1 if @p is nul-terminated
+ *
   * Convert a sequence of bytes encoded as UTF-8 to a Unicode character.
   * This function checks for incomplete characters, for invalid characters
   * such as characters that are out of the range of Unicode, and for
   * overlong encodings of valid characters.
+ *
+ * Note that g_utf8_get_char_validated() returns (gunichar)-2 if
+ * @max_len is positive and any of the bytes in the first UTF-8 character
+ * sequence are nul.
   * 
   * Returns: the resulting character. If @p points to a partial
   *     sequence at the end of a string that could begin a valid 
@@ -695,7 +709,7 @@ g_utf8_get_char_validated (const gchar *p,
   * but does no error checking on the input. A trailing 0 character
   * will be added to the string after the converted text.
   * 
- * Returns: a pointer to a newly allocated UCS-4 string.
+ * Returns: (transfer full): a pointer to a newly allocated UCS-4 string.
   *     This value must be freed with g_free().
   */
  gunichar *
@@ -820,7 +834,7 @@ try_malloc_n (gsize n_blocks, gsize n_block_bytes, GError **error)
   * representation as UCS-4. A trailing 0 character will be added to the
   * string after the converted text.
   * 
- * Returns: a pointer to a newly allocated UCS-4 string.
+ * Returns: (transfer full): a pointer to a newly allocated UCS-4 string.
   *     This value must be freed with g_free(). If an error occurs,
   *     %NULL will be returned and @error set.
   */
@@ -901,7 +915,7 @@ g_utf8_to_ucs4 (const gchar *str,
   * Convert a string from a 32-bit fixed width representation as UCS-4.
   * to UTF-8. The result will be terminated with a 0 byte.
   * 
- * Returns: a pointer to a newly allocated UTF-8 string.
+ * Returns: (transfer full): a pointer to a newly allocated UTF-8 string.
   *     This value must be freed with g_free(). If an error occurs,
   *     %NULL will be returned and @error set. In that case, @items_read
   *     will be set to the position of the first invalid input character.
@@ -988,7 +1002,7 @@ g_ucs4_to_utf8 (const gunichar *str,
   * be correctly interpreted as UTF-16, i.e. it doesn't contain
   * things unpaired surrogates.
   *
- * Returns: a pointer to a newly allocated UTF-8 string.
+ * Returns: (transfer full): a pointer to a newly allocated UTF-8 string.
   *     This value must be freed with g_free(). If an error occurs,
   *     %NULL will be returned and @error set.
   **/
@@ -1132,7 +1146,7 @@ g_utf16_to_utf8 (const gunichar2  *str,
   * Convert a string from UTF-16 to UCS-4. The result will be
   * nul-terminated.
   * 
- * Returns: a pointer to a newly allocated UCS-4 string.
+ * Returns: (transfer full): a pointer to a newly allocated UCS-4 string.
   *     This value must be freed with g_free(). If an error occurs,
   *     %NULL will be returned and @error set.
   */
@@ -1270,7 +1284,7 @@ g_utf16_to_ucs4 (const gunichar2  *str,
   * Convert a string from UTF-8 to UTF-16. A 0 character will be
   * added to the result after the converted text.
   *
- * Returns: a pointer to a newly allocated UTF-16 string.
+ * Returns: (transfer full): a pointer to a newly allocated UTF-16 string.
   *     This value must be freed with g_free(). If an error occurs,
   *     %NULL will be returned and @error set.
   */
@@ -1386,7 +1400,7 @@ g_utf8_to_utf16 (const gchar *str,
   * Convert a string from UCS-4 to UTF-16. A 0 character will be
   * added to the result after the converted text.
   * 
- * Returns: a pointer to a newly allocated UTF-16 string.
+ * Returns: (transfer full): a pointer to a newly allocated UTF-16 string.
   *     This value must be freed with g_free(). If an error occurs,
   *     %NULL will be returned and @error set.
   */
@@ -1627,7 +1641,7 @@ fast_validate_len (const char *str,
   * g_utf8_validate:
   * @str: (array length=max_len) (element-type guint8): a pointer to character data
   * @max_len: max bytes to validate, or -1 to go until NUL
- * @end: (allow-none) (out) (transfer none): return location for end of valid data
+ * @end: (out) (optional) (transfer none): return location for end of valid data
   * 
   * Validates UTF-8 encoded text. @str is the text to validate;
   * if @str is nul-terminated, then @max_len can be -1, otherwise
@@ -1655,16 +1669,48 @@ g_utf8_validate (const char   *str,
  {
    const gchar *p;
  
-  if (max_len < 0)
-    p = fast_validate (str);
+  if (max_len >= 0)
+    return g_utf8_validate_len (str, max_len, end);
+
+  p = fast_validate (str);
+
+  if (end)
+    *end = p;
+
+  if (*p != '\0')
+    return FALSE;
    else
-    p = fast_validate_len (str, max_len);
+    return TRUE;
+}
+
+/**
+ * g_utf8_validate_len:
+ * @str: (array length=max_len) (element-type guint8): a pointer to character data
+ * @max_len: max bytes to validate
+ * @end: (out) (optional) (transfer none): return location for end of valid data
+ *
+ * Validates UTF-8 encoded text.
+ *
+ * As with g_utf8_validate(), but @max_len must be set, and hence this function
+ * will always return %FALSE if any of the bytes of @str are nul.
+ *
+ * Returns: %TRUE if the text was valid UTF-8
+ * Since: 2.60
+ */
+gboolean
+g_utf8_validate_len (const char   *str,
+                     gsize         max_len,
+                     const gchar **end)
+
+{
+  const gchar *p;
+
+  p = fast_validate_len (str, max_len);
  
    if (end)
      *end = p;
  
-  if ((max_len >= 0 && p != str + max_len) ||
-      (max_len < 0 && *p != '\0'))
+  if (p != str + max_len)
      return FALSE;
    else
      return TRUE;
@@ -1706,7 +1752,7 @@ g_unichar_validate (gunichar ch)
   * newly-allocated memory, which should be freed with g_free() when
   * no longer needed. 
   *
- * Returns: a newly-allocated string which is the reverse of @str
+ * Returns: (transfer full): a newly-allocated string which is the reverse of @str
   *
   * Since: 2.2
   */
@@ -1738,6 +1784,8 @@ g_utf8_strreverse (const gchar *str,
  /**
   * g_utf8_make_valid:
   * @str: string to coerce into UTF-8
+ * @len: the maximum length of @str to use, in bytes. If @len < 0,
+ *     then the string is nul-terminated.
   *
   * If the provided string is valid UTF-8, return a copy of it. If not,
   * return a copy in which bytes that could not be interpreted as valid Unicode
@@ -1754,17 +1802,21 @@ g_utf8_strreverse (const gchar *str,
   * Since: 2.52
   */
  gchar *
-g_utf8_make_valid (const gchar *str)
+g_utf8_make_valid (const gchar *str,
+                   gssize       len)
  {
    GString *string;
    const gchar *remainder, *invalid;
-  gint remaining_bytes, valid_bytes;
+  gsize remaining_bytes, valid_bytes;
  
    g_return_val_if_fail (str != NULL, NULL);
  
+  if (len < 0)
+    len = strlen (str);
+
    string = NULL;
    remainder = str;
-  remaining_bytes = strlen (str);
+  remaining_bytes = len;
  
    while (remaining_bytes != 0) 
      {
@@ -1784,11 +1836,12 @@ g_utf8_make_valid (const gchar *str)
      }
    
    if (string == NULL)
-    return g_strdup (str);
+    return g_strndup (str, len);
    
-  g_string_append (string, remainder);
+  g_string_append_len (string, remainder, remaining_bytes);
+  g_string_append_c (string, '\0');
  
    g_assert (g_utf8_validate (string->str, -1, NULL));
-  
+
    return g_string_free (string, FALSE);
  }