* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* is made to see if the character found is actually valid other than
* it starts with an appropriate byte.
*
- * Return value: a pointer to the found character or %NULL.
+ * Returns: (transfer none) (nullable): a pointer to the found character or %NULL.
*/
gchar *
g_utf8_find_prev_char (const char *str,
/**
* g_utf8_find_next_char:
* @p: a pointer to a position within a UTF-8 encoded string
- * @end: a pointer to the byte following the end of the string,
+ * @end: (nullable): a pointer to the byte following the end of the string,
* or %NULL to indicate that the string is nul-terminated
*
* Finds the start of the next UTF-8 character in the string after @p.
* is made to see if the character found is actually valid other than
* it starts with an appropriate byte.
*
- * Return value: a pointer to the found character or %NULL
+ * If @end is %NULL, the return value will never be %NULL: if the end of the
+ * string is reached, a pointer to the terminating nul byte is returned. If
+ * @end is non-%NULL, the return value will be %NULL if the end of the string
+ * is reached.
+ *
+ * Returns: (transfer none) (nullable): a pointer to the found character or %NULL if @end is
+ * set and is reached
*/
gchar *
g_utf8_find_next_char (const gchar *p,
const gchar *end)
{
- if (*p)
+ if (end)
{
- if (end)
- for (++p; p < end && (*p & 0xc0) == 0x80; ++p)
- ;
- else
- for (++p; (*p & 0xc0) == 0x80; ++p)
- ;
+ for (++p; p < end && (*p & 0xc0) == 0x80; ++p)
+ ;
+ return (p >= end) ? NULL : (gchar *)p;
+ }
+ else
+ {
+ for (++p; (*p & 0xc0) == 0x80; ++p)
+ ;
+ return (gchar *)p;
}
- return (p == end) ? NULL : (gchar *)p;
}
/**
* it starts with an appropriate byte. If @p might be the first
* character of the string, you must use g_utf8_find_prev_char() instead.
*
- * Return value: a pointer to the found character
+ * Returns: (transfer none) (not nullable): a pointer to the found character
*/
gchar *
g_utf8_prev_char (const gchar *p)
* the terminating nul character. If the @max'th byte falls in the
* middle of a character, the last (partial) character is not counted.
*
- * Return value: the length of the string in characters
+ * Returns: the length of the string in characters
*/
glong
g_utf8_strlen (const gchar *p,
* Copies a substring out of a UTF-8 encoded string.
* The substring will contain @end_pos - @start_pos characters.
*
- * Returns: a newly allocated copy of the requested
+ * Returns: (transfer full): a newly allocated copy of the requested
* substring. Free with g_free() when no longer needed.
*
* Since: 2.30
* valid Unicode characters, you should use g_utf8_get_char_validated()
* instead.
*
- * Return value: the resulting character
+ * Returns: the resulting character
*/
gunichar
g_utf8_get_char (const gchar *p)
* This limitation exists as this function is called frequently during
* text rendering and therefore has to be as fast as possible.
*
- * Return value: the resulting pointer
+ * Returns: (transfer none): the resulting pointer
*/
gchar *
g_utf8_offset_to_pointer (const gchar *str,
* Since 2.10, this function allows @pos to be before @str, and returns
* a negative offset in this case.
*
- * Return value: the resulting character offset
+ * Returns: the resulting character offset
*/
glong
g_utf8_pointer_to_offset (const gchar *str,
/**
* g_utf8_strncpy:
- * @dest: buffer to fill with characters from @src
+ * @dest: (transfer none): buffer to fill with characters from @src
* @src: UTF-8 encoded string
* @n: character count
*
* must be valid UTF-8 encoded text. (Use g_utf8_validate() on all
* text before trying to use UTF-8 utility functions with it.)
*
- * Return value: @dest
+ * Note you must ensure @dest is at least 4 * @n + 1 bytes long, to fit the
+ * largest possible UTF-8 characters plus the terminating nul byte.
+ *
+ * Returns: (transfer none): @dest
*/
gchar *
g_utf8_strncpy (gchar *dest,
/**
* g_unichar_to_utf8:
* @c: a Unicode character code
- * @outbuf: output buffer, must have at least 6 bytes of space.
- * If %NULL, the length will be computed and returned
- * and nothing will be written to @outbuf.
+ * @outbuf: (out caller-allocates) (optional): output buffer, must have at
+ * least 6 bytes of space. If %NULL, the length will be computed and
+ * returned and nothing will be written to @outbuf.
*
* Converts a single character to UTF-8.
*
- * Return value: number of bytes written
+ * Returns: number of bytes written
*/
int
g_unichar_to_utf8 (gunichar c,
* in a UTF-8 encoded string, while limiting the search to @len bytes.
* If @len is -1, allow unbounded search.
*
- * Return value: %NULL if the string does not contain the character,
+ * Returns: (transfer none) (nullable): %NULL if the string does not contain the character,
* otherwise, a pointer to the start of the leftmost occurrence
* of the character in the string.
*/
* in a UTF-8 encoded string, while limiting the search to @len bytes.
* If @len is -1, allow unbounded search.
*
- * Return value: %NULL if the string does not contain the character,
+ * Returns: (transfer none) (nullable): %NULL if the string does not contain the character,
* otherwise, a pointer to the start of the rightmost occurrence
* of the character in the string.
*/
guint i, len;
gunichar min_code;
gunichar wc = (guchar) *p;
+ const gunichar partial_sequence = (gunichar) -2;
+ const gunichar malformed_sequence = (gunichar) -1;
if (wc < 0x80)
{
}
else if (G_UNLIKELY (wc < 0xc0))
{
- return (gunichar)-1;
+ return malformed_sequence;
}
else if (wc < 0xe0)
{
}
else
{
- return (gunichar)-1;
+ return malformed_sequence;
}
if (G_UNLIKELY (max_len >= 0 && len > max_len))
for (i = 1; i < max_len; i++)
{
if ((((guchar *)p)[i] & 0xc0) != 0x80)
- return (gunichar)-1;
+ return malformed_sequence;
}
- return (gunichar)-2;
+ return partial_sequence;
}
for (i = 1; i < len; ++i)
if (G_UNLIKELY ((ch & 0xc0) != 0x80))
{
if (ch)
- return (gunichar)-1;
+ return malformed_sequence;
else
- return (gunichar)-2;
+ return partial_sequence;
}
wc <<= 6;
}
if (G_UNLIKELY (wc < min_code))
- return (gunichar)-1;
+ return malformed_sequence;
return wc;
}
/**
* g_utf8_get_char_validated:
* @p: a pointer to Unicode character encoded as UTF-8
- * @max_len: the maximum number of bytes to read, or -1, for no maximum or
- * if @p is nul-terminated
- *
+ * @max_len: the maximum number of bytes to read, or -1 if @p is nul-terminated
+ *
* Convert a sequence of bytes encoded as UTF-8 to a Unicode character.
* This function checks for incomplete characters, for invalid characters
* such as characters that are out of the range of Unicode, and for
* overlong encodings of valid characters.
+ *
+ * Note that g_utf8_get_char_validated() returns (gunichar)-2 if
+ * @max_len is positive and any of the bytes in the first UTF-8 character
+ * sequence are nul.
*
- * Return value: the resulting character. If @p points to a partial
+ * Returns: the resulting character. If @p points to a partial
* sequence at the end of a string that could begin a valid
* character (or if @max_len is zero), returns (gunichar)-2;
* otherwise, if @p does not point to a valid UTF-8 encoded
return result;
}
+#define CONT_BYTE_FAST(p) ((guchar)*p++ & 0x3f)
+
/**
* g_utf8_to_ucs4_fast:
* @str: a UTF-8 encoded string
* @len: the maximum length of @str to use, in bytes. If @len < 0,
* then the string is nul-terminated.
- * @items_written: (allow-none): location to store the number of
- * characters in the result, or %NULL.
+ * @items_written: (out caller-allocates) (optional): location to store the
+ * number of characters in the result, or %NULL.
*
* Convert a string from UTF-8 to a 32-bit fixed width
* representation as UCS-4, assuming valid UTF-8 input.
* but does no error checking on the input. A trailing 0 character
* will be added to the string after the converted text.
*
- * Return value: a pointer to a newly allocated UCS-4 string.
+ * Returns: (transfer full): a pointer to a newly allocated UCS-4 string.
* This value must be freed with g_free().
*/
gunichar *
p = str;
for (i=0; i < n_chars; i++)
{
- gunichar wc = (guchar)*p++;
+ guchar first = (guchar)*p++;
+ gunichar wc;
- if (wc < 0x80)
+ if (first < 0xc0)
{
- result[i] = wc;
+ /* We really hope first < 0x80, but we don't want to test an
+ * extra branch for invalid input, which this function
+ * does not care about. Handling unexpected continuation bytes
+ * here will do the least damage. */
+ wc = first;
}
else
- {
- gunichar mask = 0x40;
-
- if (G_UNLIKELY ((wc & mask) == 0))
- {
- /* It's an out-of-sequence 10xxxxxxx byte.
- * Rather than making an ugly hash of this and the next byte
- * and overrunning the buffer, it's more useful to treat it
- * with a replacement character
- */
- result[i] = 0xfffd;
- continue;
- }
-
- do
- {
- wc <<= 6;
- wc |= (guchar)(*p++) & 0x3f;
- mask <<= 5;
- }
- while((wc & mask) != 0);
-
- wc &= mask - 1;
-
- result[i] = wc;
+ {
+ gunichar c1 = CONT_BYTE_FAST(p);
+ if (first < 0xe0)
+ {
+ wc = ((first & 0x1f) << 6) | c1;
+ }
+ else
+ {
+ gunichar c2 = CONT_BYTE_FAST(p);
+ if (first < 0xf0)
+ {
+ wc = ((first & 0x0f) << 12) | (c1 << 6) | c2;
+ }
+ else
+ {
+ gunichar c3 = CONT_BYTE_FAST(p);
+ wc = ((first & 0x07) << 18) | (c1 << 12) | (c2 << 6) | c3;
+ if (G_UNLIKELY (first >= 0xf8))
+ {
+ /* This can't be valid UTF-8, but g_utf8_next_char()
+ * and company allow out-of-range sequences */
+ gunichar mask = 1 << 20;
+ while ((wc & mask) != 0)
+ {
+ wc <<= 6;
+ wc |= CONT_BYTE_FAST(p);
+ mask <<= 5;
+ }
+ wc &= mask - 1;
+ }
+ }
+ }
}
+ result[i] = wc;
}
result[i] = 0;
* @str: a UTF-8 encoded string
* @len: the maximum length of @str to use, in bytes. If @len < 0,
* then the string is nul-terminated.
- * @items_read: (allow-none): location to store number of bytes read, or %NULL.
+ * @items_read: (out caller-allocates) (optional): location to store number of
+ * bytes read, or %NULL.
* If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
* returned in case @str contains a trailing partial
* character. If an error occurs then the index of the
* invalid input is stored here.
- * @items_written: (allow-none): location to store number of characters
- * written or %NULL. The value here stored does not include the
- * trailing 0 character.
+ * @items_written: (out caller-allocates) (optional): location to store number
+ * of characters written or %NULL. The value here stored does not include
+ * the trailing 0 character.
* @error: location to store the error occurring, or %NULL to ignore
* errors. Any of the errors in #GConvertError other than
* %G_CONVERT_ERROR_NO_CONVERSION may occur.
* representation as UCS-4. A trailing 0 character will be added to the
* string after the converted text.
*
- * Return value: a pointer to a newly allocated UCS-4 string.
+ * Returns: (transfer full): a pointer to a newly allocated UCS-4 string.
* This value must be freed with g_free(). If an error occurs,
* %NULL will be returned and @error set.
*/
* @str: a UCS-4 encoded string
* @len: the maximum length (number of characters) of @str to use.
* If @len < 0, then the string is nul-terminated.
- * @items_read: (allow-none): location to store number of characters
- * read, or %NULL.
- * @items_written: (allow-none): location to store number of bytes
- * written or %NULL. The value here stored does not include the
- * trailing 0 byte.
+ * @items_read: (out caller-allocates) (optional): location to store number of
+ * characters read, or %NULL.
+ * @items_written: (out caller-allocates) (optional): location to store number
+ * of bytes written or %NULL. The value here stored does not include the
+ * trailing 0 byte.
* @error: location to store the error occurring, or %NULL to ignore
* errors. Any of the errors in #GConvertError other than
* %G_CONVERT_ERROR_NO_CONVERSION may occur.
* Convert a string from a 32-bit fixed width representation as UCS-4
* to UTF-8. The result will be terminated with a 0 byte.
*
- * Return value: a pointer to a newly allocated UTF-8 string.
+ * Returns: (transfer full): a pointer to a newly allocated UTF-8 string.
* This value must be freed with g_free(). If an error occurs,
* %NULL will be returned and @error set. In that case, @items_read
* will be set to the position of the first invalid input character.
* @str: a UTF-16 encoded string
* @len: the maximum length (number of #gunichar2) of @str to use.
* If @len < 0, then the string is nul-terminated.
- * @items_read: (allow-none): location to store number of words read,
- * or %NULL. If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
- * returned in case @str contains a trailing partial character. If
+ * @items_read: (out caller-allocates) (optional): location to store number of
+ * words read, or %NULL. If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will
+ * be returned in case @str contains a trailing partial character. If
* an error occurs then the index of the invalid input is stored here.
- * @items_written: (allow-none): location to store number of bytes written,
- * or %NULL. The value stored here does not include the trailing 0 byte.
+ * @items_written: (out caller-allocates) (optional): location to store number
+ * of bytes written, or %NULL. The value stored here does not include the
+ * trailing 0 byte.
* @error: location to store the error occurring, or %NULL to ignore
* errors. Any of the errors in #GConvertError other than
* %G_CONVERT_ERROR_NO_CONVERSION may occur.
* be correctly interpreted as UTF-16, i.e. it doesn't contain
* things like unpaired surrogates.
*
- * Return value: a pointer to a newly allocated UTF-8 string.
+ * Returns: (transfer full): a pointer to a newly allocated UTF-8 string.
* This value must be freed with g_free(). If an error occurs,
* %NULL will be returned and @error set.
**/
* @str: a UTF-16 encoded string
* @len: the maximum length (number of #gunichar2) of @str to use.
* If @len < 0, then the string is nul-terminated.
- * @items_read: (allow-none): location to store number of words read,
- * or %NULL. If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
- * returned in case @str contains a trailing partial character. If
+ * @items_read: (out caller-allocates) (optional): location to store number of
+ * words read, or %NULL. If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will
+ * be returned in case @str contains a trailing partial character. If
* an error occurs then the index of the invalid input is stored here.
- * @items_written: (allow-none): location to store number of characters
- * written, or %NULL. The value stored here does not include the trailing
- * 0 character.
+ * @items_written: (out caller-allocates) (optional): location to store number
+ * of characters written, or %NULL. The value stored here does not include
+ * the trailing 0 character.
* @error: location to store the error occurring, or %NULL to ignore
* errors. Any of the errors in #GConvertError other than
* %G_CONVERT_ERROR_NO_CONVERSION may occur.
* Convert a string from UTF-16 to UCS-4. The result will be
* nul-terminated.
*
- * Return value: a pointer to a newly allocated UCS-4 string.
+ * Returns: (transfer full): a pointer to a newly allocated UCS-4 string.
* This value must be freed with g_free(). If an error occurs,
* %NULL will be returned and @error set.
*/
* @str: a UTF-8 encoded string
* @len: the maximum length (number of bytes) of @str to use.
* If @len < 0, then the string is nul-terminated.
- * @items_read: (allow-none): location to store number of bytes read,
- * or %NULL. If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
- * returned in case @str contains a trailing partial character. If
+ * @items_read: (out caller-allocates) (optional): location to store number of
+ * bytes read, or %NULL. If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will
+ * be returned in case @str contains a trailing partial character. If
* an error occurs then the index of the invalid input is stored here.
- * @items_written: (allow-none): location to store number of #gunichar2
- * written, or %NULL. The value stored here does not include the
- * trailing 0.
+ * @items_written: (out caller-allocates) (optional): location to store number
+ * of #gunichar2 written, or %NULL. The value stored here does not include
+ * the trailing 0.
* @error: location to store the error occurring, or %NULL to ignore
* errors. Any of the errors in #GConvertError other than
* %G_CONVERT_ERROR_NO_CONVERSION may occur.
* Convert a string from UTF-8 to UTF-16. A 0 character will be
* added to the result after the converted text.
*
- * Return value: a pointer to a newly allocated UTF-16 string.
+ * Returns: (transfer full): a pointer to a newly allocated UTF-16 string.
* This value must be freed with g_free(). If an error occurs,
* %NULL will be returned and @error set.
*/
* @str: a UCS-4 encoded string
* @len: the maximum length (number of characters) of @str to use.
* If @len < 0, then the string is nul-terminated.
- * @items_read: (allow-none): location to store number of bytes read,
- * or %NULL. If an error occurs then the index of the invalid input
- * is stored here.
- * @items_written: (allow-none): location to store number of #gunichar2
- * written, or %NULL. The value stored here does not include the
- * trailing 0.
+ * @items_read: (out caller-allocates) (optional): location to store number of
+ * characters read, or %NULL. If an error occurs then the index of the
+ * invalid input is stored here.
+ * @items_written: (out caller-allocates) (optional): location to store number
+ * of #gunichar2 written, or %NULL. The value stored here does not include
+ * the trailing 0.
* @error: location to store the error occurring, or %NULL to ignore
* errors. Any of the errors in #GConvertError other than
* %G_CONVERT_ERROR_NO_CONVERSION may occur.
* Convert a string from UCS-4 to UTF-16. A 0 character will be
* added to the result after the converted text.
*
- * Return value: a pointer to a newly allocated UTF-16 string.
+ * Returns: (transfer full): a pointer to a newly allocated UTF-16 string.
* This value must be freed with g_free(). If an error occurs,
* %NULL will be returned and @error set.
*/
return result;
}
-#define CONTINUATION_CHAR \
- G_STMT_START { \
- if ((*(guchar *)p & 0xc0) != 0x80) /* 10xxxxxx */ \
- goto error; \
- val <<= 6; \
- val |= (*(guchar *)p) & 0x3f; \
- } G_STMT_END
+#define VALIDATE_BYTE(mask, expect) \
+ G_STMT_START { \
+ if (G_UNLIKELY((*(guchar *)p & (mask)) != (expect))) \
+ goto error; \
+ } G_STMT_END
+
+/* see IETF RFC 3629 Section 4 */
static const gchar *
fast_validate (const char *str)
{
- gunichar val = 0;
- gunichar min = 0;
const gchar *p;
for (p = str; *p; p++)
else
{
const gchar *last;
-
+
last = p;
- if ((*(guchar *)p & 0xe0) == 0xc0) /* 110xxxxx */
+ if (*(guchar *)p < 0xe0) /* 110xxxxx */
{
- if (G_UNLIKELY ((*(guchar *)p & 0x1e) == 0))
- goto error;
- p++;
- if (G_UNLIKELY ((*(guchar *)p & 0xc0) != 0x80)) /* 10xxxxxx */
+ if (G_UNLIKELY (*(guchar *)p < 0xc2))
goto error;
}
else
{
- if ((*(guchar *)p & 0xf0) == 0xe0) /* 1110xxxx */
+ if (*(guchar *)p < 0xf0) /* 1110xxxx */
{
- min = (1 << 11);
- val = *(guchar *)p & 0x0f;
- goto TWO_REMAINING;
+ switch (*(guchar *)p++ & 0x0f)
+ {
+ case 0:
+ VALIDATE_BYTE(0xe0, 0xa0); /* 0xa0 ... 0xbf */
+ break;
+ case 0x0d:
+ VALIDATE_BYTE(0xe0, 0x80); /* 0x80 ... 0x9f */
+ break;
+ default:
+ VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
+ }
}
- else if ((*(guchar *)p & 0xf8) == 0xf0) /* 11110xxx */
+ else if (*(guchar *)p < 0xf5) /* 11110xxx excluding out-of-range */
{
- min = (1 << 16);
- val = *(guchar *)p & 0x07;
+ switch (*(guchar *)p++ & 0x07)
+ {
+ case 0:
+ VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
+ if (G_UNLIKELY((*(guchar *)p & 0x30) == 0))
+ goto error;
+ break;
+ case 4:
+ VALIDATE_BYTE(0xf0, 0x80); /* 0x80 ... 0x8f */
+ break;
+ default:
+ VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
+ }
+ p++;
+ VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
}
else
goto error;
-
- p++;
- CONTINUATION_CHAR;
- TWO_REMAINING:
- p++;
- CONTINUATION_CHAR;
- p++;
- CONTINUATION_CHAR;
-
- if (G_UNLIKELY (val < min))
- goto error;
+ }
+
+ p++;
+ VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
- if (G_UNLIKELY (!UNICODE_VALID(val)))
- goto error;
- }
-
continue;
-
+
error:
return last;
}
gssize max_len)
{
- gunichar val = 0;
- gunichar min = 0;
const gchar *p;
g_assert (max_len >= 0);
else
{
const gchar *last;
-
+
last = p;
- if ((*(guchar *)p & 0xe0) == 0xc0) /* 110xxxxx */
+ if (*(guchar *)p < 0xe0) /* 110xxxxx */
{
if (G_UNLIKELY (max_len - (p - str) < 2))
goto error;
- if (G_UNLIKELY ((*(guchar *)p & 0x1e) == 0))
- goto error;
- p++;
- if (G_UNLIKELY ((*(guchar *)p & 0xc0) != 0x80)) /* 10xxxxxx */
+ if (G_UNLIKELY (*(guchar *)p < 0xc2))
goto error;
}
else
{
- if ((*(guchar *)p & 0xf0) == 0xe0) /* 1110xxxx */
+ if (*(guchar *)p < 0xf0) /* 1110xxxx */
{
if (G_UNLIKELY (max_len - (p - str) < 3))
goto error;
-
- min = (1 << 11);
- val = *(guchar *)p & 0x0f;
- goto TWO_REMAINING;
+
+ switch (*(guchar *)p++ & 0x0f)
+ {
+ case 0:
+ VALIDATE_BYTE(0xe0, 0xa0); /* 0xa0 ... 0xbf */
+ break;
+ case 0x0d:
+ VALIDATE_BYTE(0xe0, 0x80); /* 0x80 ... 0x9f */
+ break;
+ default:
+ VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
+ }
}
- else if ((*(guchar *)p & 0xf8) == 0xf0) /* 11110xxx */
+ else if (*(guchar *)p < 0xf5) /* 11110xxx excluding out-of-range */
{
if (G_UNLIKELY (max_len - (p - str) < 4))
goto error;
-
- min = (1 << 16);
- val = *(guchar *)p & 0x07;
+
+ switch (*(guchar *)p++ & 0x07)
+ {
+ case 0:
+ VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
+ if (G_UNLIKELY((*(guchar *)p & 0x30) == 0))
+ goto error;
+ break;
+ case 4:
+ VALIDATE_BYTE(0xf0, 0x80); /* 0x80 ... 0x8f */
+ break;
+ default:
+ VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
+ }
+ p++;
+ VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
}
else
goto error;
-
- p++;
- CONTINUATION_CHAR;
- TWO_REMAINING:
- p++;
- CONTINUATION_CHAR;
- p++;
- CONTINUATION_CHAR;
-
- if (G_UNLIKELY (val < min))
- goto error;
- if (G_UNLIKELY (!UNICODE_VALID(val)))
- goto error;
- }
-
+ }
+
+ p++;
+ VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
+
continue;
-
+
error:
return last;
}
* g_utf8_validate:
* @str: (array length=max_len) (element-type guint8): a pointer to character data
* @max_len: max bytes to validate, or -1 to go until NUL
- * @end: (allow-none) (out) (transfer none): return location for end of valid data
+ * @end: (out) (optional) (transfer none): return location for end of valid data
*
* Validates UTF-8 encoded text. @str is the text to validate;
* if @str is nul-terminated, then @max_len can be -1, otherwise
* or the network should be checked with g_utf8_validate() before
* doing anything else with it.
*
- * Return value: %TRUE if the text was valid UTF-8
+ * Returns: %TRUE if the text was valid UTF-8
*/
gboolean
g_utf8_validate (const char *str,
{
const gchar *p;
- if (max_len < 0)
- p = fast_validate (str);
+ if (max_len >= 0)
+ return g_utf8_validate_len (str, max_len, end);
+
+ p = fast_validate (str);
+
+ if (end)
+ *end = p;
+
+ if (*p != '\0')
+ return FALSE;
else
- p = fast_validate_len (str, max_len);
+ return TRUE;
+}
+
+/**
+ * g_utf8_validate_len:
+ * @str: (array length=max_len) (element-type guint8): a pointer to character data
+ * @max_len: max bytes to validate
+ * @end: (out) (optional) (transfer none): return location for end of valid data
+ *
+ * Validates UTF-8 encoded text.
+ *
+ * As with g_utf8_validate(), but @max_len must be set, and hence this function
+ * will always return %FALSE if any of the bytes of @str are nul.
+ *
+ * Returns: %TRUE if the text was valid UTF-8
+ * Since: 2.60
+ */
+gboolean
+g_utf8_validate_len (const char *str,
+ gsize max_len,
+ const gchar **end)
+
+{
+ const gchar *p;
+
+ p = fast_validate_len (str, max_len);
if (end)
*end = p;
- if ((max_len >= 0 && p != str + max_len) ||
- (max_len < 0 && *p != '\0'))
+ if (p != str + max_len)
return FALSE;
else
return TRUE;
* integer values of @ch will not be valid. 0 is considered a valid
* character, though it's normally a string terminator.
*
- * Return value: %TRUE if @ch is a valid Unicode character
+ * Returns: %TRUE if @ch is a valid Unicode character
**/
gboolean
g_unichar_validate (gunichar ch)
* newly-allocated memory, which should be freed with g_free() when
* no longer needed.
*
- * Returns: a newly-allocated string which is the reverse of @str
+ * Returns: (transfer full): a newly-allocated string which is the reverse of @str
*
* Since: 2.2
*/
return result;
}
-
+/**
+ * g_utf8_make_valid:
+ * @str: string to coerce into UTF-8
+ * @len: the maximum length of @str to use, in bytes. If @len < 0,
+ * then the string is nul-terminated.
+ *
+ * If the provided string is valid UTF-8, return a copy of it. If not,
+ * return a copy in which bytes that could not be interpreted as valid Unicode
+ * are replaced with the Unicode replacement character (U+FFFD).
+ *
+ * For example, this is an appropriate function to use if you have received
+ * a string that was incorrectly declared to be UTF-8, and you need a valid
+ * UTF-8 version of it that can be logged or displayed to the user, with the
+ * assumption that it is close enough to ASCII or UTF-8 to be mostly
+ * readable as-is.
+ *
+ * Returns: (transfer full): a valid UTF-8 string whose content resembles @str
+ *
+ * Since: 2.52
+ */
gchar *
-_g_utf8_make_valid (const gchar *name)
+g_utf8_make_valid (const gchar *str,
+ gssize len)
{
GString *string;
const gchar *remainder, *invalid;
- gint remaining_bytes, valid_bytes;
+ gsize remaining_bytes, valid_bytes;
- g_return_val_if_fail (name != NULL, NULL);
+ g_return_val_if_fail (str != NULL, NULL);
+
+ if (len < 0)
+ len = strlen (str);
string = NULL;
- remainder = name;
- remaining_bytes = strlen (name);
+ remainder = str;
+ remaining_bytes = len;
while (remaining_bytes != 0)
{
}
if (string == NULL)
- return g_strdup (name);
+ return g_strndup (str, len);
- g_string_append (string, remainder);
+ g_string_append_len (string, remainder, remaining_bytes);
+ g_string_append_c (string, '\0');
g_assert (g_utf8_validate (string->str, -1, NULL));
-
+
return g_string_free (string, FALSE);
}