From dcad28805702d580064bc39a267d63c58bbb3b3f Mon Sep 17 00:00:00 2001 From: Jarkko Hietaniemi Date: Wed, 25 Oct 2000 20:00:48 +0000 Subject: [PATCH] Continue the internal UTF-8 API tweaking. Rename utf8_to_uv_chk() back to utf8_to_uv() because it's used much more than the simpler API, now called utf8_to_uv_simple(). Still not quite happy with API, too much partial duplication of functionality. p4raw-id: //depot/perl@7439 --- doop.c | 23 ++++++++++----------- embed.h | 10 ++++----- embed.pl | 6 +++--- objXSUB.h | 8 ++++---- op.c | 12 +++++------ perlapi.c | 14 ++++++------- pod/perlapi.pod | 44 +++++++++++++++++++-------------------- pod/perlunicode.pod | 5 ----- pp.c | 18 ++++++++-------- pp_ctl.c | 6 +++--- proto.h | 4 ++-- regcomp.c | 6 +++--- regexec.c | 8 ++++---- sv.c | 4 ++-- toke.c | 4 ++-- utf8.c | 59 ++++++++++++++++++++++++++--------------------------- 16 files changed, 112 insertions(+), 119 deletions(-) diff --git a/doop.c b/doop.c index 3cd8f07..fa927c2 100644 --- a/doop.c +++ b/doop.c @@ -77,7 +77,7 @@ S_do_trans_simple(pTHX_ SV *sv) ulen = 1; /* Need to check this, otherwise 128..255 won't match */ - c = utf8_to_uv_chk(s, send - s, &ulen, 0); + c = utf8_to_uv(s, send - s, &ulen, 0); if (c < 0x100 && (ch = tbl[(short)c]) >= 0) { matches++; if (ch < 0x80) @@ -125,7 +125,7 @@ S_do_trans_count(pTHX_ SV *sv)/* SPC - OK */ STRLEN ulen; ulen = 1; if (hasutf) - c = utf8_to_uv_chk(s, send - s, &ulen, 0); + c = utf8_to_uv(s, send - s, &ulen, 0); else c = *s; if (c < 0x100 && tbl[c] >= 0) @@ -364,7 +364,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */ } else if (uv == none) { /* "none" is unmapped character */ STRLEN ulen; - *d++ = (U8)utf8_to_uv_chk(s, send - s, &ulen, 0); + *d++ = (U8)utf8_to_uv(s, send - s, &ulen, 0); s += ulen; puv = 0xfeedface; continue; @@ -405,7 +405,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */ } else if (uv == none) { /* "none" is unmapped character */ STRLEN ulen; - *d++ = (U8)utf8_to_uv_chk(s, send - s, &ulen, 0); + *d++ = (U8)utf8_to_uv(s, send - s, &ulen, 0); s += ulen; continue; } @@ -550,9 +550,8 @@ Perl_do_vecget(pTHX_ SV *sv, I32 offset, I32 size) if (size < 1 || (size & (size-1))) /* size < 1 or not a power of two */ Perl_croak(aTHX_ "Illegal number of bits in vec"); - if (SvUTF8(sv)) { + if (SvUTF8(sv)) (void) Perl_sv_utf8_downgrade(aTHX_ sv, TRUE); - } offset *= size; /* turn into bit offset */ len = (offset + size + 7) / 8; /* required number of bytes */ @@ -969,10 +968,10 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right) switch (optype) { case OP_BIT_AND: while (lulen && rulen) { - luc = utf8_to_uv_chk((U8*)lc, lulen, &ulen, 0); + luc = utf8_to_uv((U8*)lc, lulen, &ulen, 0); lc += ulen; lulen -= ulen; - ruc = utf8_to_uv_chk((U8*)rc, rulen, &ulen, 0); + ruc = utf8_to_uv((U8*)rc, rulen, &ulen, 0); rc += ulen; rulen -= ulen; duc = luc & ruc; @@ -984,10 +983,10 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right) break; case OP_BIT_XOR: while (lulen && rulen) { - luc = utf8_to_uv_chk((U8*)lc, lulen, &ulen, 0); + luc = utf8_to_uv((U8*)lc, lulen, &ulen, 0); lc += ulen; lulen -= ulen; - ruc = utf8_to_uv_chk((U8*)rc, rulen, &ulen, 0); + ruc = utf8_to_uv((U8*)rc, rulen, &ulen, 0); rc += ulen; rulen -= ulen; duc = luc ^ ruc; @@ -996,10 +995,10 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right) goto mop_up_utf; case OP_BIT_OR: while (lulen && rulen) { - luc = utf8_to_uv_chk((U8*)lc, lulen, &ulen, 0); + luc = utf8_to_uv((U8*)lc, lulen, &ulen, 0); lc += ulen; lulen -= ulen; - ruc = utf8_to_uv_chk((U8*)rc, rulen, &ulen, 0); + ruc = utf8_to_uv((U8*)rc, rulen, &ulen, 0); rc += ulen; rulen -= ulen; duc = luc | ruc; diff --git a/embed.h b/embed.h index eab037f..a588819 100644 --- a/embed.h +++ b/embed.h @@ -729,8 +729,8 @@ #define utf8_hop Perl_utf8_hop #define utf8_to_bytes Perl_utf8_to_bytes #define bytes_to_utf8 Perl_bytes_to_utf8 +#define utf8_to_uv_simple Perl_utf8_to_uv_simple #define utf8_to_uv Perl_utf8_to_uv -#define utf8_to_uv_chk Perl_utf8_to_uv_chk #define uv_to_utf8 Perl_uv_to_utf8 #define vivify_defelem Perl_vivify_defelem #define vivify_ref Perl_vivify_ref @@ -2189,8 +2189,8 @@ #define utf8_hop(a,b) Perl_utf8_hop(aTHX_ a,b) #define utf8_to_bytes(a,b) Perl_utf8_to_bytes(aTHX_ a,b) #define bytes_to_utf8(a,b) Perl_bytes_to_utf8(aTHX_ a,b) -#define utf8_to_uv(a,b) Perl_utf8_to_uv(aTHX_ a,b) -#define utf8_to_uv_chk(a,b,c,d) Perl_utf8_to_uv_chk(aTHX_ a,b,c,d) +#define utf8_to_uv_simple(a,b) Perl_utf8_to_uv_simple(aTHX_ a,b) +#define utf8_to_uv(a,b,c,d) Perl_utf8_to_uv(aTHX_ a,b,c,d) #define uv_to_utf8(a,b) Perl_uv_to_utf8(aTHX_ a,b) #define vivify_defelem(a) Perl_vivify_defelem(aTHX_ a) #define vivify_ref(a,b) Perl_vivify_ref(aTHX_ a,b) @@ -4290,10 +4290,10 @@ #define utf8_to_bytes Perl_utf8_to_bytes #define Perl_bytes_to_utf8 CPerlObj::Perl_bytes_to_utf8 #define bytes_to_utf8 Perl_bytes_to_utf8 +#define Perl_utf8_to_uv_simple CPerlObj::Perl_utf8_to_uv_simple +#define utf8_to_uv_simple Perl_utf8_to_uv_simple #define Perl_utf8_to_uv CPerlObj::Perl_utf8_to_uv #define utf8_to_uv Perl_utf8_to_uv -#define Perl_utf8_to_uv_chk CPerlObj::Perl_utf8_to_uv_chk -#define utf8_to_uv_chk Perl_utf8_to_uv_chk #define Perl_uv_to_utf8 CPerlObj::Perl_uv_to_utf8 #define uv_to_utf8 Perl_uv_to_utf8 #define Perl_vivify_defelem CPerlObj::Perl_vivify_defelem diff --git a/embed.pl b/embed.pl index 8f80bbf..62135fc 100755 --- a/embed.pl +++ b/embed.pl @@ -2074,9 +2074,9 @@ Ap |I32 |utf8_distance |U8 *a|U8 *b Ap |U8* |utf8_hop |U8 *s|I32 off ApM |U8* |utf8_to_bytes |U8 *s|STRLEN *len ApM |U8* |bytes_to_utf8 |U8 *s|STRLEN *len -Ap |UV |utf8_to_uv |U8 *s|STRLEN* retlen -Ap |UV |utf8_to_uv_chk |U8 *s|STRLEN curlen|STRLEN* retlen|U32 flags -Ap |U8* |uv_to_utf8 |U8 *d|UV uv +Ap |UV |utf8_to_uv_simple|U8 *s|STRLEN* retlen +Ap |UV |utf8_to_uv |U8 *s|STRLEN curlen|STRLEN* retlen|U32 flags +Ap |U8* |uv_to_utf8|U8 *d|UV uv p |void |vivify_defelem |SV* sv p |void |vivify_ref |SV* sv|U32 to_what p |I32 |wait4pid |Pid_t pid|int* statusp|int flags diff --git a/objXSUB.h b/objXSUB.h index bc04f03..4d5ff6b 100644 --- a/objXSUB.h +++ b/objXSUB.h @@ -1869,14 +1869,14 @@ #define Perl_bytes_to_utf8 pPerl->Perl_bytes_to_utf8 #undef bytes_to_utf8 #define bytes_to_utf8 Perl_bytes_to_utf8 +#undef Perl_utf8_to_uv_simple +#define Perl_utf8_to_uv_simple pPerl->Perl_utf8_to_uv_simple +#undef utf8_to_uv_simple +#define utf8_to_uv_simple Perl_utf8_to_uv_simple #undef Perl_utf8_to_uv #define Perl_utf8_to_uv pPerl->Perl_utf8_to_uv #undef utf8_to_uv #define utf8_to_uv Perl_utf8_to_uv -#undef Perl_utf8_to_uv_chk -#define Perl_utf8_to_uv_chk pPerl->Perl_utf8_to_uv_chk -#undef utf8_to_uv_chk -#define utf8_to_uv_chk Perl_utf8_to_uv_chk #undef Perl_uv_to_utf8 #define Perl_uv_to_utf8 pPerl->Perl_uv_to_utf8 #undef uv_to_utf8 diff --git a/op.c b/op.c index 9e256a3..1aa558e 100644 --- a/op.c +++ b/op.c @@ -2658,7 +2658,7 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) for (j = 0; j < i; j++) { U8 *s = cp[j]; I32 cur = j < i ? cp[j+1] - s : tend - s; - UV val = utf8_to_uv_chk(s, cur, &ulen, 0); + UV val = utf8_to_uv(s, cur, &ulen, 0); s += ulen; diff = val - nextmin; if (diff > 0) { @@ -2671,7 +2671,7 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) } } if (*s == 0xff) - val = utf8_to_uv_chk(s+1, cur - 1, &ulen, 0); + val = utf8_to_uv(s+1, cur - 1, &ulen, 0); if (val >= nextmin) nextmin = val + 1; } @@ -2698,11 +2698,11 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) while (t < tend || tfirst <= tlast) { /* see if we need more "t" chars */ if (tfirst > tlast) { - tfirst = (I32)utf8_to_uv_chk(t, tend - t, &ulen, 0); + tfirst = (I32)utf8_to_uv(t, tend - t, &ulen, 0); t += ulen; if (t < tend && *t == 0xff) { /* illegal utf8 val indicates range */ t++; - tlast = (I32)utf8_to_uv_chk(t, tend - t, &ulen, 0); + tlast = (I32)utf8_to_uv(t, tend - t, &ulen, 0); t += ulen; } else @@ -2712,11 +2712,11 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) /* now see if we need more "r" chars */ if (rfirst > rlast) { if (r < rend) { - rfirst = (I32)utf8_to_uv_chk(r, rend - r, &ulen, 0); + rfirst = (I32)utf8_to_uv(r, rend - r, &ulen, 0); r += ulen; if (r < rend && *r == 0xff) { /* illegal utf8 val indicates range */ r++; - rlast = (I32)utf8_to_uv_chk(r, rend - r, &ulen, 0); + rlast = (I32)utf8_to_uv(r, rend - r, &ulen, 0); r += ulen; } else diff --git a/perlapi.c b/perlapi.c index 1f1343d..efa7164 100644 --- a/perlapi.c +++ b/perlapi.c @@ -1327,7 +1327,7 @@ Perl_to_uni_lower_lc(pTHXo_ U32 c) } #undef Perl_is_utf8_char -int +STRLEN Perl_is_utf8_char(pTHXo_ U8 *p) { return ((CPerlObj*)pPerl)->Perl_is_utf8_char(p); @@ -3378,18 +3378,18 @@ Perl_bytes_to_utf8(pTHXo_ U8 *s, STRLEN *len) return ((CPerlObj*)pPerl)->Perl_bytes_to_utf8(s, len); } -#undef Perl_utf8_to_uv +#undef Perl_utf8_to_uv_simple UV -Perl_utf8_to_uv(pTHXo_ U8 *s, STRLEN* retlen) +Perl_utf8_to_uv_simple(pTHXo_ U8 *s, STRLEN* retlen) { - return ((CPerlObj*)pPerl)->Perl_utf8_to_uv(s, retlen); + return ((CPerlObj*)pPerl)->Perl_utf8_to_uv_simple(s, retlen); } -#undef Perl_utf8_to_uv_chk +#undef Perl_utf8_to_uv UV -Perl_utf8_to_uv_chk(pTHXo_ U8 *s, STRLEN curlen, STRLEN* retlen, bool checking) +Perl_utf8_to_uv(pTHXo_ U8 *s, STRLEN curlen, STRLEN* retlen, U32 flags) { - return ((CPerlObj*)pPerl)->Perl_utf8_to_uv_chk(s, curlen, retlen, checking); + return ((CPerlObj*)pPerl)->Perl_utf8_to_uv(s, curlen, retlen, flags); } #undef Perl_uv_to_utf8 diff --git a/pod/perlapi.pod b/pod/perlapi.pod index 730d89f..634180f 100644 --- a/pod/perlapi.pod +++ b/pod/perlapi.pod @@ -2368,19 +2368,19 @@ false, defined or undefined. Does not handle 'get' magic. =for hackers Found in file sv.h -=item svtype +=item SvTYPE -An enum of flags for Perl types. These are found in the file B -in the C enum. Test these flags with the C macro. +Returns the type of the SV. See C. + + svtype SvTYPE(SV* sv) =for hackers Found in file sv.h -=item SvTYPE - -Returns the type of the SV. See C. +=item svtype - svtype SvTYPE(SV* sv) +An enum of flags for Perl types. These are found in the file B +in the C enum. Test these flags with the C macro. =for hackers Found in file sv.h @@ -3218,32 +3218,32 @@ Found in file utf8.c =item utf8_to_uv Returns the character value of the first character in the string C -which is assumed to be in UTF8 encoding; C will be set to the -length, in bytes, of that character, and the pointer C will be -advanced to the end of the character. +which is assumed to be in UTF8 encoding and no longer than C; +C will be set to the length, in bytes, of that character, +and the pointer C will be advanced to the end of the character. -If C does not point to a well-formed UTF8 character, an optional UTF8 +If C does not point to a well-formed UTF8 character, the behaviour +is dependent on the value of C: if this is true, it is +assumed that the caller will raise a warning, and this function will +set C to C<-1> and return. If C is not true, an optional UTF8 warning is produced. - U8* s utf8_to_uv(STRLEN *retlen) + U8* s utf8_to_uv(STRLEN curlen, I32 *retlen, U32 flags) =for hackers Found in file utf8.c -=item utf8_to_uv_chk +=item utf8_to_uv_simple Returns the character value of the first character in the string C -which is assumed to be in UTF8 encoding and no longer than C; -C will be set to the length, in bytes, of that character, -and the pointer C will be advanced to the end of the character. +which is assumed to be in UTF8 encoding; C will be set to the +length, in bytes, of that character, and the pointer C will be +advanced to the end of the character. -If C does not point to a well-formed UTF8 character, the behaviour -is dependent on the value of C: if this is true, it is -assumed that the caller will raise a warning, and this function will -set C to C<-1> and return. If C is not true, an optional UTF8 -warning is produced. +If C does not point to a well-formed UTF8 character, zero is +returned and retlen is set, if possible, to -1. - U8* s utf8_to_uv_chk(STRLEN curlen, I32 *retlen, I32 checking) + U8* s utf8_to_uv_simple(STRLEN *retlen) =for hackers Found in file utf8.c diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod index 145c953..c9954d8 100644 --- a/pod/perlunicode.pod +++ b/pod/perlunicode.pod @@ -71,11 +71,6 @@ on Windows. Regardless of the above, the C pragma can always be used to force byte semantics in a particular lexical scope. See L. -One effect of the C pragma is that the internal UTF-8 decoding -becomes stricter so that the character 0xFFFF (UTF-8 bytes 0xEF 0xBF -0xBF), and the bytes 0xFE and 0xFF, start to cause warnings if they -appear in the data. - The C pragma is primarily a compatibility device that enables recognition of UTF-8 in literals encountered by the parser. It may also be used for enabling some of the more experimental Unicode support features. diff --git a/pp.c b/pp.c index ba50627..6d77ca1 100644 --- a/pp.c +++ b/pp.c @@ -1484,7 +1484,7 @@ PP(pp_complement) send = tmps + len; while (tmps < send) { - UV c = utf8_to_uv_chk(tmps, 0, &l, UTF8_ALLOW_ANY); + UV c = utf8_to_uv(tmps, 0, &l, UTF8_ALLOW_ANY); tmps += UTF8SKIP(tmps); targlen += UNISKIP(~c); } @@ -1493,7 +1493,7 @@ PP(pp_complement) tmps -= len; Newz(0, result, targlen + 1, U8); while (tmps < send) { - UV c = utf8_to_uv_chk(tmps, 0, &l, UTF8_ALLOW_ANY); + UV c = utf8_to_uv(tmps, 0, &l, UTF8_ALLOW_ANY); tmps += UTF8SKIP(tmps); result = uv_to_utf8(result,(UV)~c); } @@ -2240,7 +2240,7 @@ PP(pp_ord) STRLEN retlen; if ((*tmps & 0x80) && DO_UTF8(tmpsv)) - value = utf8_to_uv_chk(tmps, len, &retlen, 0); + value = utf8_to_uv(tmps, len, &retlen, 0); else value = (UV)(*tmps & 255); XPUSHu(value); @@ -2307,7 +2307,7 @@ PP(pp_ucfirst) STRLEN ulen; U8 tmpbuf[UTF8_MAXLEN]; U8 *tend; - UV uv = utf8_to_uv_chk(s, slen, &ulen, 0); + UV uv = utf8_to_uv(s, slen, &ulen, 0); if (PL_op->op_private & OPpLOCALE) { TAINT; @@ -2366,7 +2366,7 @@ PP(pp_lcfirst) STRLEN ulen; U8 tmpbuf[UTF8_MAXLEN]; U8 *tend; - UV uv = utf8_to_uv_chk(s, slen, &ulen, 0); + UV uv = utf8_to_uv(s, slen, &ulen, 0); if (PL_op->op_private & OPpLOCALE) { TAINT; @@ -2443,7 +2443,7 @@ PP(pp_uc) TAINT; SvTAINTED_on(TARG); while (s < send) { - d = uv_to_utf8(d, toUPPER_LC_uni( utf8_to_uv_chk(s, len, &ulen, 0))); + d = uv_to_utf8(d, toUPPER_LC_uni( utf8_to_uv(s, len, &ulen, 0))); s += ulen; } } @@ -2517,7 +2517,7 @@ PP(pp_lc) TAINT; SvTAINTED_on(TARG); while (s < send) { - d = uv_to_utf8(d, toLOWER_LC_uni( utf8_to_uv_chk(s, len, &ulen, 0))); + d = uv_to_utf8(d, toLOWER_LC_uni( utf8_to_uv(s, len, &ulen, 0))); s += ulen; } } @@ -3660,7 +3660,7 @@ PP(pp_unpack) if (checksum) { while (len-- > 0 && s < strend) { STRLEN alen; - auint = utf8_to_uv_chk((U8*)s, strend - s, &alen, 0); + auint = utf8_to_uv((U8*)s, strend - s, &alen, 0); along = alen; s += along; if (checksum > 32) @@ -3674,7 +3674,7 @@ PP(pp_unpack) EXTEND_MORTAL(len); while (len-- > 0 && s < strend) { STRLEN alen; - auint = utf8_to_uv_chk((U8*)s, strend - s, &alen, 0); + auint = utf8_to_uv((U8*)s, strend - s, &alen, 0); along = alen; s += along; sv = NEWSV(37, 0); diff --git a/pp_ctl.c b/pp_ctl.c index 33f91ee..a65cb1b 100644 --- a/pp_ctl.c +++ b/pp_ctl.c @@ -2975,13 +2975,13 @@ PP(pp_require) U8 *s = (U8*)SvPVX(sv); U8 *end = (U8*)SvPVX(sv) + SvCUR(sv); if (s < end) { - rev = utf8_to_uv_chk(s, end - s, &len, 0); + rev = utf8_to_uv(s, end - s, &len, 0); s += len; if (s < end) { - ver = utf8_to_uv_chk(s, end - s, &len, 0); + ver = utf8_to_uv(s, end - s, &len, 0); s += len; if (s < end) - sver = utf8_to_uv_chk(s, end - s, &len, 0); + sver = utf8_to_uv(s, end - s, &len, 0); } } if (PERL_REVISION < rev diff --git a/proto.h b/proto.h index 14a6e47..06ef1b1 100644 --- a/proto.h +++ b/proto.h @@ -809,8 +809,8 @@ PERL_CALLCONV I32 Perl_utf8_distance(pTHX_ U8 *a, U8 *b); PERL_CALLCONV U8* Perl_utf8_hop(pTHX_ U8 *s, I32 off); PERL_CALLCONV U8* Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len); PERL_CALLCONV U8* Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len); -PERL_CALLCONV UV Perl_utf8_to_uv(pTHX_ U8 *s, STRLEN* retlen); -PERL_CALLCONV UV Perl_utf8_to_uv_chk(pTHX_ U8 *s, STRLEN curlen, STRLEN* retlen, U32 flags); +PERL_CALLCONV UV Perl_utf8_to_uv_simple(pTHX_ U8 *s, STRLEN* retlen); +PERL_CALLCONV UV Perl_utf8_to_uv(pTHX_ U8 *s, STRLEN curlen, STRLEN* retlen, U32 flags); PERL_CALLCONV U8* Perl_uv_to_utf8(pTHX_ U8 *d, UV uv); PERL_CALLCONV void Perl_vivify_defelem(pTHX_ SV* sv); PERL_CALLCONV void Perl_vivify_ref(pTHX_ SV* sv, U32 to_what); diff --git a/regcomp.c b/regcomp.c index 3f2b10c..1946720 100644 --- a/regcomp.c +++ b/regcomp.c @@ -2884,7 +2884,7 @@ tryagain: default: normal_default: if ((*p & 0xc0) == 0xc0 && UTF) { - ender = utf8_to_uv_chk((U8*)p, PL_regxend - p, + ender = utf8_to_uv((U8*)p, PL_regxend - p, &numlen, 0); p += numlen; } @@ -3639,14 +3639,14 @@ S_regclassutf8(pTHX) namedclass = OOB_NAMEDCLASS; if (!range) rangebegin = PL_regcomp_parse; - value = utf8_to_uv_chk((U8*)PL_regcomp_parse, + value = utf8_to_uv((U8*)PL_regcomp_parse, PL_regxend - PL_regcomp_parse, &numlen, 0); PL_regcomp_parse += numlen; if (value == '[') namedclass = regpposixcc(value); else if (value == '\\') { - value = (U32)utf8_to_uv_chk((U8*)PL_regcomp_parse, + value = (U32)utf8_to_uv((U8*)PL_regcomp_parse, PL_regxend - PL_regcomp_parse, &numlen, 0); PL_regcomp_parse += numlen; diff --git a/regexec.c b/regexec.c index 350f432..a71f1d8 100644 --- a/regexec.c +++ b/regexec.c @@ -917,7 +917,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta PL_reg_flags |= RF_tainted; /* FALL THROUGH */ case BOUNDUTF8: - tmp = (I32)(s != startpos) ? utf8_to_uv_chk(reghop((U8*)s, -1), + tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), strend - s, 0, 0) : '\n'; tmp = ((OP(c) == BOUNDUTF8 ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0); @@ -955,7 +955,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta PL_reg_flags |= RF_tainted; /* FALL THROUGH */ case NBOUNDUTF8: - tmp = (I32)(s != startpos) ? utf8_to_uv_chk(reghop((U8*)s, -1), + tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), strend - s, 0, 0) : '\n'; tmp = ((OP(c) == NBOUNDUTF8 ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0); @@ -2002,7 +2002,7 @@ S_regmatch(pTHX_ regnode *prog) while (s < e) { if (l >= PL_regeol) sayNO; - if (utf8_to_uv_chk((U8*)s, e - s, 0, 0) != (c1 ? + if (utf8_to_uv((U8*)s, e - s, 0, 0) != (c1 ? toLOWER_utf8((U8*)l) : toLOWER_LC_utf8((U8*)l))) { @@ -2140,7 +2140,7 @@ S_regmatch(pTHX_ regnode *prog) case NBOUNDUTF8: /* was last char in word? */ ln = (locinput != PL_regbol) - ? utf8_to_uv_chk(reghop((U8*)locinput, -1), + ? utf8_to_uv(reghop((U8*)locinput, -1), PL_regeol - locinput, 0, 0) : PL_regprev; if (OP(scan) == BOUNDUTF8 || OP(scan) == NBOUNDUTF8) { ln = isALNUM_uni(ln); diff --git a/sv.c b/sv.c index 2790cfd..726f853 100644 --- a/sv.c +++ b/sv.c @@ -6364,7 +6364,7 @@ Perl_sv_vcatpvfn(pTHX_ SV *sv, const char *pat, STRLEN patlen, va_list *args, SV break; } if (utf) - iv = (IV)utf8_to_uv_chk(vecstr, veclen, &ulen, 0); + iv = (IV)utf8_to_uv(vecstr, veclen, &ulen, 0); else { iv = *vecstr; ulen = 1; @@ -6447,7 +6447,7 @@ Perl_sv_vcatpvfn(pTHX_ SV *sv, const char *pat, STRLEN patlen, va_list *args, SV break; } if (utf) - uv = utf8_to_uv_chk(vecstr, veclen, &ulen, 0); + uv = utf8_to_uv(vecstr, veclen, &ulen, 0); else { uv = *vecstr; ulen = 1; diff --git a/toke.c b/toke.c index 78ed359..3572b0e 100644 --- a/toke.c +++ b/toke.c @@ -816,7 +816,7 @@ Perl_str_to_version(pTHX_ SV *sv) STRLEN skip; UV n; if (utf) - n = utf8_to_uv_chk((U8*)start, len, &skip, 0); + n = utf8_to_uv((U8*)start, len, &skip, 0); else { n = *(U8*)start; skip = 1; @@ -1331,7 +1331,7 @@ S_scan_const(pTHX_ char *start) STRLEN len; UV uv; - uv = utf8_to_uv_chk((U8*)s, send - s, &len, UTF8_CHECK_ONLY); + uv = utf8_to_uv((U8*)s, send - s, &len, UTF8_CHECK_ONLY); if (len == 1) { /* Illegal UTF8 (a high-bit byte), make it valid. */ char *old_pvx = SvPVX(sv); diff --git a/utf8.c b/utf8.c index 7bb34b7..80f8846 100644 --- a/utf8.c +++ b/utf8.c @@ -171,7 +171,7 @@ Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len) } /* -=for apidoc Am|U8* s|utf8_to_uv_chk|STRLEN curlen|I32 *retlen|U32 flags +=for apidoc Am|U8* s|utf8_to_uv|STRLEN curlen|I32 *retlen|U32 flags Returns the character value of the first character in the string C which is assumed to be in UTF8 encoding and no longer than C; @@ -179,16 +179,15 @@ C will be set to the length, in bytes, of that character, and the pointer C will be advanced to the end of the character. If C does not point to a well-formed UTF8 character, the behaviour -is dependent on the value of C: if this is true, it is -assumed that the caller will raise a warning, and this function will -set C to C<-1> and return. If C is not true, an optional UTF8 -warning is produced. +is dependent on the value of C: if it contains UTF8_CHECK_ONLY, +it is assumed that the caller will raise a warning, and this function +will set C to C<-1> and return. The C can also contain +various flags to allow deviations from the strict UTF-8 encoding. -=cut -*/ +=cut */ UV -Perl_utf8_to_uv_chk(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags) +Perl_utf8_to_uv(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags) { dTHR; UV uv = *s, ouv; @@ -324,7 +323,7 @@ malformed: } /* -=for apidoc Am|U8* s|utf8_to_uv|STRLEN *retlen +=for apidoc Am|U8* s|utf8_to_uv_simple|STRLEN *retlen Returns the character value of the first character in the string C which is assumed to be in UTF8 encoding; C will be set to the @@ -338,9 +337,9 @@ returned and retlen is set, if possible, to -1. */ UV -Perl_utf8_to_uv(pTHX_ U8* s, STRLEN* retlen) +Perl_utf8_to_uv_simple(pTHX_ U8* s, STRLEN* retlen) { - return Perl_utf8_to_uv_chk(aTHX_ s, (STRLEN)-1, retlen, 0); + return Perl_utf8_to_uv(aTHX_ s, (STRLEN)-1, retlen, 0); } /* utf8_distance(a,b) returns the number of UTF8 characters between @@ -400,30 +399,30 @@ Returns zero on failure, setting C to -1. U8 * Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN *len) { - dTHR; U8 *send; U8 *d; - U8 *save; - - send = s + *len; - d = save = s; + U8 *save = s; /* ensure valid UTF8 and chars < 256 before updating string */ - while (s < send) { - U8 c = *s++; + for (send = s + *len; s < send; ) { + U8 c = *s++; + if (c >= 0x80 && - ( (s >= send) || ((*s++ & 0xc0) != 0x80) || ((c & 0xfe) != 0xc2))) { - *len = -1; - return 0; - } + ((s >= send) || + ((*s++ & 0xc0) != 0x80) || ((c & 0xfe) != 0xc2))) { + *len = -1; + return 0; + } } - s = save; + + d = s = save; while (s < send) { - if (*s < 0x80) - *d++ = *s++; + if (*s < 0x80) { + *d++ = *s++; + } else { STRLEN ulen; - *d++ = (U8)utf8_to_uv(s, &ulen); + *d++ = (U8)utf8_to_uv_simple(s, &ulen); s += ulen; } } @@ -951,7 +950,7 @@ Perl_to_utf8_upper(pTHX_ U8 *p) if (!PL_utf8_toupper) PL_utf8_toupper = swash_init("utf8", "ToUpper", &PL_sv_undef, 4, 0); uv = swash_fetch(PL_utf8_toupper, p); - return uv ? uv : utf8_to_uv_chk(p,STRLEN_MAX,0,0); + return uv ? uv : utf8_to_uv(p,STRLEN_MAX,0,0); } UV @@ -962,7 +961,7 @@ Perl_to_utf8_title(pTHX_ U8 *p) if (!PL_utf8_totitle) PL_utf8_totitle = swash_init("utf8", "ToTitle", &PL_sv_undef, 4, 0); uv = swash_fetch(PL_utf8_totitle, p); - return uv ? uv : utf8_to_uv_chk(p,STRLEN_MAX,0,0); + return uv ? uv : utf8_to_uv(p,STRLEN_MAX,0,0); } UV @@ -973,7 +972,7 @@ Perl_to_utf8_lower(pTHX_ U8 *p) if (!PL_utf8_tolower) PL_utf8_tolower = swash_init("utf8", "ToLower", &PL_sv_undef, 4, 0); uv = swash_fetch(PL_utf8_tolower, p); - return uv ? uv : utf8_to_uv_chk(p,STRLEN_MAX,0,0); + return uv ? uv : utf8_to_uv(p,STRLEN_MAX,0,0); } /* a "swash" is a swatch hash */ @@ -1063,7 +1062,7 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr) PUSHMARK(SP); EXTEND(SP,3); PUSHs((SV*)sv); - PUSHs(sv_2mortal(newSViv(utf8_to_uv_chk(ptr, STRLEN_MAX, 0, 0) & ~(needents - 1)))); + PUSHs(sv_2mortal(newSViv(utf8_to_uv(ptr, STRLEN_MAX, 0, 0) & ~(needents - 1)))); PUSHs(sv_2mortal(newSViv(needents))); PUTBACK; if (call_method("SWASHGET", G_SCALAR)) -- 2.7.4