From b76347f2eb34c85a0a38543b2f57ca474fedab4d Mon Sep 17 00:00:00 2001 From: Jarkko Hietaniemi Date: Sat, 18 Nov 2000 22:50:28 +0000 Subject: [PATCH] Introduce Perl_utf8_length(). Use it. p4raw-id: //depot/perl@7744 --- embed.h | 4 ++++ embed.pl | 1 + objXSUB.h | 4 ++++ perlapi.c | 7 +++++++ proto.h | 1 + sv.c | 18 ++++++------------ utf8.c | 29 +++++++++++++++++++++++++++++ 7 files changed, 52 insertions(+), 12 deletions(-) diff --git a/embed.h b/embed.h index 7bb132d..1301e3e 100644 --- a/embed.h +++ b/embed.h @@ -725,6 +725,7 @@ #define utilize Perl_utilize #define utf16_to_utf8 Perl_utf16_to_utf8 #define utf16_to_utf8_reversed Perl_utf16_to_utf8_reversed +#define utf8_length Perl_utf8_length #define utf8_distance Perl_utf8_distance #define utf8_hop Perl_utf8_hop #define utf8_to_bytes Perl_utf8_to_bytes @@ -2186,6 +2187,7 @@ #define utilize(a,b,c,d,e) Perl_utilize(aTHX_ a,b,c,d,e) #define utf16_to_utf8(a,b,c,d) Perl_utf16_to_utf8(aTHX_ a,b,c,d) #define utf16_to_utf8_reversed(a,b,c,d) Perl_utf16_to_utf8_reversed(aTHX_ a,b,c,d) +#define utf8_length(a,b) Perl_utf8_length(aTHX_ a,b) #define utf8_distance(a,b) Perl_utf8_distance(aTHX_ a,b) #define utf8_hop(a,b) Perl_utf8_hop(aTHX_ a,b) #define utf8_to_bytes(a,b) Perl_utf8_to_bytes(aTHX_ a,b) @@ -4284,6 +4286,8 @@ #define utf16_to_utf8 Perl_utf16_to_utf8 #define Perl_utf16_to_utf8_reversed CPerlObj::Perl_utf16_to_utf8_reversed #define utf16_to_utf8_reversed Perl_utf16_to_utf8_reversed +#define Perl_utf8_length CPerlObj::Perl_utf8_length +#define utf8_length Perl_utf8_length #define Perl_utf8_distance CPerlObj::Perl_utf8_distance #define utf8_distance Perl_utf8_distance #define Perl_utf8_hop CPerlObj::Perl_utf8_hop diff --git a/embed.pl b/embed.pl index a19c439..b8abef3 100755 --- a/embed.pl +++ b/embed.pl @@ -2070,6 +2070,7 @@ p |void |unshare_hek |HEK* hek p |void |utilize |int aver|I32 floor|OP* version|OP* id|OP* arg Ap |U8* |utf16_to_utf8 |U8* p|U8 *d|I32 bytelen|I32 *newlen Ap |U8* |utf16_to_utf8_reversed|U8* 
p|U8 *d|I32 bytelen|I32 *newlen +Ap |STRLEN |utf8_length |U8* s|U8 *e Ap |I32 |utf8_distance |U8 *a|U8 *b Ap |U8* |utf8_hop |U8 *s|I32 off ApM |U8* |utf8_to_bytes |U8 *s|STRLEN *len diff --git a/objXSUB.h b/objXSUB.h index 5827b72..88eb400 100644 --- a/objXSUB.h +++ b/objXSUB.h @@ -1853,6 +1853,10 @@ #define Perl_utf16_to_utf8_reversed pPerl->Perl_utf16_to_utf8_reversed #undef utf16_to_utf8_reversed #define utf16_to_utf8_reversed Perl_utf16_to_utf8_reversed +#undef Perl_utf8_length +#define Perl_utf8_length pPerl->Perl_utf8_length +#undef utf8_length +#define utf8_length Perl_utf8_length #undef Perl_utf8_distance #define Perl_utf8_distance pPerl->Perl_utf8_distance #undef utf8_distance diff --git a/perlapi.c b/perlapi.c index a9dd2f0..a2e73e4 100644 --- a/perlapi.c +++ b/perlapi.c @@ -3350,6 +3350,13 @@ Perl_utf16_to_utf8_reversed(pTHXo_ U8* p, U8 *d, I32 bytelen, I32 *newlen) return ((CPerlObj*)pPerl)->Perl_utf16_to_utf8_reversed(p, d, bytelen, newlen); } +#undef Perl_utf8_length +STRLEN +Perl_utf8_length(pTHXo_ U8* s, U8 *e) +{ + return ((CPerlObj*)pPerl)->Perl_utf8_length(s, e); +} + #undef Perl_utf8_distance I32 Perl_utf8_distance(pTHXo_ U8 *a, U8 *b) diff --git a/proto.h b/proto.h index 052346d..91b7f86 100644 --- a/proto.h +++ b/proto.h @@ -805,6 +805,7 @@ PERL_CALLCONV void Perl_unshare_hek(pTHX_ HEK* hek); PERL_CALLCONV void Perl_utilize(pTHX_ int aver, I32 floor, OP* version, OP* id, OP* arg); PERL_CALLCONV U8* Perl_utf16_to_utf8(pTHX_ U8* p, U8 *d, I32 bytelen, I32 *newlen); PERL_CALLCONV U8* Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8 *d, I32 bytelen, I32 *newlen); +PERL_CALLCONV STRLEN Perl_utf8_length(pTHX_ U8* s, U8 *e); PERL_CALLCONV I32 Perl_utf8_distance(pTHX_ U8 *a, U8 *b); PERL_CALLCONV U8* Perl_utf8_hop(pTHX_ U8 *s, I32 off); PERL_CALLCONV U8* Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len); diff --git a/sv.c b/sv.c index 375b956..e193bc5 100644 --- a/sv.c +++ b/sv.c @@ -3994,26 +3994,20 @@ UTF8 bytes as a single character. 
STRLEN Perl_sv_len_utf8(pTHX_ register SV *sv) { - U8 *s; - U8 *send; - STRLEN len; - if (!sv) return 0; #ifdef NOTYET if (SvGMAGICAL(sv)) - len = mg_length(sv); + return mg_length(sv); else #endif - s = (U8*)SvPV(sv, len); - send = s + len; - len = 0; - while (s < send) { - s += UTF8SKIP(s); - len++; + { + STRLEN len; + U8 *s = (U8*)SvPV(sv, len); + + return Perl_utf8_length(aTHX_ s, s + len); } - return len; } void diff --git a/utf8.c b/utf8.c index f1b80a4..fc625dc 100644 --- a/utf8.c +++ b/utf8.c @@ -353,6 +353,35 @@ Perl_utf8_to_uv_simple(pTHX_ U8* s, STRLEN* retlen) return Perl_utf8_to_uv(aTHX_ s, (STRLEN)-1, retlen, 0); } +/* +=for apidoc|utf8_length|U8 *s|U8 *e + +Return the length of the UTF-8 char encoded string C<s> in characters. +Stops at string C<e>. If C<e E<lt> s> or if the scan would end up +past C<e>, return -1. + +=cut +*/ + +STRLEN +Perl_utf8_length(pTHX_ U8* s, U8* e) +{ + STRLEN len = 0; + + if (e < s) + return -1; + while (s < e) { + STRLEN t = UTF8SKIP(s); + + if (e - s < t) + return -1; + s += t; + len++; + } + + return len; +} + /* utf8_distance(a,b) returns the number of UTF8 characters between the pointers a and b */ -- 2.7.4