From 768483871f7d05689a92ec84d2182a1b6e3c0516 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sat, 11 Feb 2012 14:20:56 -0700 Subject: [PATCH] Deprecate is_utf8_char() This function assumes that there is enough space in the buffer to read however many bytes are indicated by the first byte in the alleged UTF-8 encoded string. This may not be true, and so it can read beyond the buffer end. is_utf8_char_buf() should be used instead. --- embed.fnc | 2 +- proto.h | 1 + utf8.c | 10 +++++++--- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/embed.fnc b/embed.fnc index 892a719..34aa251 100644 --- a/embed.fnc +++ b/embed.fnc @@ -626,7 +626,7 @@ ApPR |bool |is_uni_print_lc|UV c ApPR |bool |is_uni_punct_lc|UV c ApPR |bool |is_uni_xdigit_lc|UV c Anpd |bool |is_ascii_string|NN const U8 *s|STRLEN len -Anpd |STRLEN |is_utf8_char |NN const U8 *s +AnpdD |STRLEN |is_utf8_char |NN const U8 *s Anpd |STRLEN |is_utf8_char_buf|NN const U8 *buf|NN const U8 *buf_end Anpd |bool |is_utf8_string |NN const U8 *s|STRLEN len Anpdmb |bool |is_utf8_string_loc|NN const U8 *s|STRLEN len|NULLOK const U8 **p diff --git a/proto.h b/proto.h index dde1a43..84bfbf4 100644 --- a/proto.h +++ b/proto.h @@ -1819,6 +1819,7 @@ PERL_CALLCONV bool Perl_is_utf8_ascii(pTHX_ const U8 *p) assert(p) PERL_CALLCONV STRLEN Perl_is_utf8_char(const U8 *s) + __attribute__deprecated__ __attribute__nonnull__(1); #define PERL_ARGS_ASSERT_IS_UTF8_CHAR \ assert(s) diff --git a/utf8.c b/utf8.c index 2e0429e..5c1f7c0 100644 --- a/utf8.c +++ b/utf8.c @@ -355,21 +355,25 @@ Perl_is_utf8_char_buf(const U8 *buf, const U8* buf_end) /* =for apidoc is_utf8_char +DEPRECATED! + Tests if some arbitrary number of bytes begins in a valid UTF-8 character. Note that an INVARIANT (i.e. ASCII on non-EBCDIC machines) character is a valid UTF-8 character. The actual number of bytes in the UTF-8 character will be returned if it is valid, otherwise 0. 
-WARNING: use only if you *know* that C<s> has at least either UTF8_MAXBYTES or -UTF8SKIP(s) bytes. +This function is deprecated due to the possibility that malformed input could +cause reading beyond the end of the input buffer. Use C<is_utf8_char_buf> +instead. =cut */ + STRLEN Perl_is_utf8_char(const U8 *s) { PERL_ARGS_ASSERT_IS_UTF8_CHAR; - /* Assumes we have enough space */ + /* Assumes we have enough space, which is why this is deprecated */ return is_utf8_char_buf(s, s + UTF8SKIP(s)); } -- 2.7.4