From 8850bf83ea1d396a0a5bb6dd7e4d4b9556496409 Mon Sep 17 00:00:00 2001 From: Jarkko Hietaniemi Date: Fri, 8 Dec 2000 03:19:03 +0000 Subject: [PATCH] Use the UTF8 macros a bit. They can't be used with abandon everywhere because we do generate illegal UTF-8 in some situations. This is of course naughty. p4raw-id: //depot/perl@8033 --- pod/perlapi.pod | 15 ++++++++------- utf8.c | 29 ++++++++++++++++++++--------- utf8.h | 5 +++++ 3 files changed, 33 insertions(+), 16 deletions(-) diff --git a/pod/perlapi.pod b/pod/perlapi.pod index 50ac40c..7fdc55e 100644 --- a/pod/perlapi.pod +++ b/pod/perlapi.pod @@ -3234,11 +3234,12 @@ Found in file utf8.c =item utf8_hop -Move the C<s> pointing to UTF-8 data by C<off> characters, either forward -or backward. +Return the UTF-8 pointer C<s> displaced by C<off> characters, either +forward or backward. WARNING: do not use the following unless you *know* C<off> is within -the UTF-8 buffer pointed to by C<s>. +the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned +on the first byte of character or just after the last byte of a character. U8* utf8_hop(U8 *s, I32 off) @@ -3279,10 +3280,10 @@ If C<s> does not point to a well-formed UTF8 character, the behaviour is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY, it is assumed that the caller will raise a warning, and this function will set C<retlen> to C<-1> and return zero. If the C<flags> does not -contain UTF8_CHECK_ONLY, the UNICODE_REPLACEMENT_CHARACTER (0xFFFD) -will be returned, and C<retlen> will be set to the expected length of -the UTF-8 character in bytes. The C<flags> can also contain various -flags to allow deviations from the strict UTF-8 encoding (see F<utf8.h>). +contain UTF8_CHECK_ONLY, the UNICODE_REPLACEMENT (0xFFFD) will be +returned, and C<retlen> will be set to the expected length of the +UTF-8 character in bytes. The C<flags> can also contain various flags +to allow deviations from the strict UTF-8 encoding (see F<utf8.h>). 
U8* s utf8_to_uv(STRLEN curlen, STRLEN *retlen, U32 flags) diff --git a/utf8.c b/utf8.c index 98e13e8..7a652b4 100644 --- a/utf8.c +++ b/utf8.c @@ -137,7 +137,7 @@ Perl_is_utf8_char(pTHX_ U8 *s) while (slen--) { if ((*s & 0xc0) != 0x80) return 0; - uv = (uv << 6) | (*s & 0x3f); + uv = UTF8_ACCUMULATE(uv, *s); if (uv < ouv) return 0; ouv = uv; @@ -285,7 +285,7 @@ Perl_utf8_to_uv(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags) goto malformed; } else - uv = (uv << 6) | (*s & 0x3f); + uv = UTF8_ACCUMULATE(uv, *s); if (uv < ouv) { /* This cannot be allowed. */ if (dowarn) @@ -379,6 +379,10 @@ Perl_utf8_length(pTHX_ U8* s, U8* e) { STRLEN len = 0; + /* Note: cannot use UTF8_IS_...() too eagerly here since e.g. + * the bitops (especially ~) can create illegal UTF-8. + * In other words: in Perl UTF-8 is not just for Unicode. */ + if (e < s) Perl_croak(aTHX_ "panic: utf8_length: unexpected end"); while (s < e) { @@ -409,6 +413,10 @@ Perl_utf8_distance(pTHX_ U8 *a, U8 *b) { IV off = 0; + /* Note: cannot use UTF8_IS_...() too eagerly here since e.g. + * the bitops (especially ~) can create illegal UTF-8. + * In other words: in Perl UTF-8 is not just for Unicode. */ + if (a < b) { while (a < b) { U8 c = UTF8SKIP(a); @@ -436,17 +444,22 @@ Perl_utf8_distance(pTHX_ U8 *a, U8 *b) /* =for apidoc Am|U8*|utf8_hop|U8 *s|I32 off -Move the C<s> pointing to UTF-8 data by C<off> characters, either forward -or backward. +Return the UTF-8 pointer C<s> displaced by C<off> characters, either +forward or backward. WARNING: do not use the following unless you *know* C<off> is within -the UTF-8 buffer pointed to by C<s>. +the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned +on the first byte of character or just after the last byte of a character. =cut */ U8 * Perl_utf8_hop(pTHX_ U8 *s, I32 off) { + /* Note: cannot use UTF8_IS_...() too eagerly here since e.g. + * the bitops (especially ~) can create illegal UTF-8. + * In other words: in Perl UTF-8 is not just for Unicode. 
*/ + if (off >= 0) { while (off--) s += UTF8SKIP(s); @@ -454,10 +467,8 @@ Perl_utf8_hop(pTHX_ U8 *s, I32 off) else { while (off++) { s--; - if (*s & 0x80) { - while ((*s & 0xc0) == 0x80) - s--; - } + while (UTF8_IS_CONTINUATION(*s)) + s--; } } return s; diff --git a/utf8.h b/utf8.h index bafdc57..26ef723 100644 --- a/utf8.h +++ b/utf8.h @@ -65,6 +65,11 @@ END_EXTERN_C #define UTF8_IS_ASCII(c) ((c) < 0x80) #define UTF8_IS_START(c) ((c) >= 0xc0 && ((c) <= 0xfd)) #define UTF8_IS_CONTINUATION(c) ((c) >= 0x80 && ((c) <= 0xbf)) +#define UTF8_IS_CONTINUED(c) ((c) & 0x80) + +#define UTF8_CONTINUATION_MASK 0x3f +#define UTF8_ACCUMULATION_SHIFT 6 +#define UTF8_ACCUMULATE(old, new) ((old) << UTF8_ACCUMULATION_SHIFT | ((new) & UTF8_CONTINUATION_MASK)) #ifdef HAS_QUAD #define UNISKIP(uv) ( (uv) < 0x80 ? 1 : \ -- 2.7.4