Convert some uvuni() to uvchr()
authorKarl Williamson <public@khwilliamson.com>
Sun, 24 Feb 2013 23:43:59 +0000 (16:43 -0700)
committerKarl Williamson <public@khwilliamson.com>
Thu, 29 Aug 2013 15:55:56 +0000 (09:55 -0600)
All the tables are now based on the native character set, so using
uvuni() in almost all cases is wrong.

cygwin/cygwin.c
doop.c
op.c
pp_pack.c
regcomp.c
regexec.c
toke.c
utf8.c

index 87401d1..e7be5e3 100644 (file)
@@ -156,7 +156,7 @@ wide_to_utf8(const wchar_t *wbuf)
     char *oldlocale = setlocale(LC_CTYPE, NULL);
     setlocale(LC_CTYPE, "utf-8");
 
-    /* uvuni_to_utf8(buf, chr) or Encoding::_bytes_to_utf8(sv, "UCS-2BE"); */
+    /* uvchr_to_utf8(buf, chr) or Encoding::_bytes_to_utf8(sv, "UCS-2BE"); */
     wlen = wcsrtombs(NULL, (const wchar_t **)&wbuf, wlen, NULL);
     buf = (char *) safemalloc(wlen+1);
     wcsrtombs(buf, (const wchar_t **)&wbuf, wlen, NULL);
@@ -176,7 +176,7 @@ utf8_to_wide(const char *buf)
 
     setlocale(LC_CTYPE, "utf-8");
     wbuf = (wchar_t *) safemalloc(wlen);
-    /* utf8_to_uvuni_buf(pathname, pathname + wlen, wpath) or Encoding::_utf8_to_bytes(sv, "UCS-2BE"); */
+    /* utf8_to_uvchr_buf(pathname, pathname + wlen, wpath) or Encoding::_utf8_to_bytes(sv, "UCS-2BE"); */
     wlen = mbsrtowcs(wbuf, (const char**)&buf, wlen, &mbs);
 
     if (oldlocale) setlocale(LC_CTYPE, oldlocale);
@@ -283,7 +283,7 @@ XS(XS_Cygwin_win_to_posix_path)
            mbstate_t mbs;
             char *oldlocale = setlocale(LC_CTYPE, NULL);
             setlocale(LC_CTYPE, "utf-8");
-           /* utf8_to_uvuni_buf(src_path, src_path + wlen, wpath) or Encoding::_utf8_to_bytes(sv, "UCS-2BE"); */
+           /* utf8_to_uvchr_buf(src_path, src_path + wlen, wpath) or Encoding::_utf8_to_bytes(sv, "UCS-2BE"); */
            wlen = mbsrtowcs(wpath, (const char**)&src_path, wlen, &mbs);
            if (wlen > 0)
                err = cygwin_conv_path(what, wpath, wbuf, wlen);
@@ -370,7 +370,7 @@ XS(XS_Cygwin_posix_to_win_path)
        setlocale(LC_CTYPE, "utf-8");
        if (!IN_BYTES) {
            mbstate_t mbs;
-           /* utf8_to_uvuni_buf(src_path, src_path + wlen, wpath) or Encoding::_utf8_to_bytes(sv, "UCS-2BE"); */
+           /* utf8_to_uvchr_buf(src_path, src_path + wlen, wpath) or Encoding::_utf8_to_bytes(sv, "UCS-2BE"); */
            wlen = mbsrtowcs(wpath, (const char**)&src_path, wlen, &mbs);
            if (wlen > 0)
                err = cygwin_conv_path(what, wpath, wbuf, wlen);
diff --git a/doop.c b/doop.c
index 5a819a7..ac11c73 100644 (file)
--- a/doop.c
+++ b/doop.c
@@ -361,7 +361,7 @@ S_do_trans_simple_utf8(pTHX_ SV * const sv)
        if (uv < none) {
            s += UTF8SKIP(s);
            matches++;
-           d = uvuni_to_utf8(d, uv);
+           d = uvchr_to_utf8(d, uv);
        }
        else if (uv == none) {
            const int i = UTF8SKIP(s);
@@ -372,7 +372,7 @@ S_do_trans_simple_utf8(pTHX_ SV * const sv)
        else if (uv == extra) {
            s += UTF8SKIP(s);
            matches++;
-           d = uvuni_to_utf8(d, final);
+           d = uvchr_to_utf8(d, final);
        }
        else
            s += UTF8SKIP(s);
@@ -532,7 +532,7 @@ S_do_trans_complex_utf8(pTHX_ SV * const sv)
                matches++;
                s += UTF8SKIP(s);
                if (uv != puv) {
-                   d = uvuni_to_utf8(d, uv);
+                   d = uvchr_to_utf8(d, uv);
                    puv = uv;
                }
                continue;
@@ -550,13 +550,13 @@ S_do_trans_complex_utf8(pTHX_ SV * const sv)
                if (havefinal) {
                    s += UTF8SKIP(s);
                    if (puv != final) {
-                       d = uvuni_to_utf8(d, final);
+                       d = uvchr_to_utf8(d, final);
                        puv = final;
                    }
                }
                else {
                    STRLEN len;
-                   uv = utf8n_to_uvuni(s, send - s, &len, UTF8_ALLOW_DEFAULT);
+                   uv = utf8n_to_uvchr(s, send - s, &len, UTF8_ALLOW_DEFAULT);
                    if (uv != puv) {
                        Move(s, d, len, U8);
                        d += len;
@@ -585,7 +585,7 @@ S_do_trans_complex_utf8(pTHX_ SV * const sv)
            if (uv < none) {
                matches++;
                s += UTF8SKIP(s);
-               d = uvuni_to_utf8(d, uv);
+               d = uvchr_to_utf8(d, uv);
                continue;
            }
            else if (uv == none) {      /* "none" is unmapped character */
@@ -598,7 +598,7 @@ S_do_trans_complex_utf8(pTHX_ SV * const sv)
            else if (uv == extra && !del) {
                matches++;
                s += UTF8SKIP(s);
-               d = uvuni_to_utf8(d, final);
+               d = uvchr_to_utf8(d, final);
                continue;
            }
            matches++;                  /* "none+1" is delete character */
diff --git a/op.c b/op.c
index 7d8ac92..23eeaaa 100644 (file)
--- a/op.c
+++ b/op.c
@@ -4129,11 +4129,11 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
            i = 0;
            transv = newSVpvs("");
            while (t < tend) {
-               cp[2*i] = utf8n_to_uvuni(t, tend-t, &ulen, flags);
+               cp[2*i] = utf8n_to_uvchr(t, tend-t, &ulen, flags);
                t += ulen;
                if (t < tend && *t == ILLEGAL_UTF8_BYTE) {
                    t++;
-                   cp[2*i+1] = utf8n_to_uvuni(t, tend-t, &ulen, flags);
+                   cp[2*i+1] = utf8n_to_uvchr(t, tend-t, &ulen, flags);
                    t += ulen;
                }
                else {
@@ -4146,11 +4146,11 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
                UV  val = cp[2*j];
                diff = val - nextmin;
                if (diff > 0) {
-                   t = uvuni_to_utf8(tmpbuf,nextmin);
+                   t = uvchr_to_utf8(tmpbuf,nextmin);
                    sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf);
                    if (diff > 1) {
                        U8  range_mark = ILLEGAL_UTF8_BYTE;
-                       t = uvuni_to_utf8(tmpbuf, val - 1);
+                       t = uvchr_to_utf8(tmpbuf, val - 1);
                        sv_catpvn(transv, (char *)&range_mark, 1);
                        sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf);
                    }
@@ -4159,13 +4159,13 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
                if (val >= nextmin)
                    nextmin = val + 1;
            }
-           t = uvuni_to_utf8(tmpbuf,nextmin);
+           t = uvchr_to_utf8(tmpbuf,nextmin);
            sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf);
            {
                U8 range_mark = ILLEGAL_UTF8_BYTE;
                sv_catpvn(transv, (char *)&range_mark, 1);
            }
-           t = uvuni_to_utf8(tmpbuf, 0x7fffffff);
+           t = uvchr_to_utf8(tmpbuf, 0x7fffffff);
            sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf);
            t = (const U8*)SvPVX_const(transv);
            tlen = SvCUR(transv);
@@ -4186,11 +4186,11 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
        while (t < tend || tfirst <= tlast) {
            /* see if we need more "t" chars */
            if (tfirst > tlast) {
-               tfirst = (I32)utf8n_to_uvuni(t, tend - t, &ulen, flags);
+               tfirst = (I32)utf8n_to_uvchr(t, tend - t, &ulen, flags);
                t += ulen;
                if (t < tend && *t == ILLEGAL_UTF8_BYTE) {      /* illegal utf8 val indicates range */
                    t++;
-                   tlast = (I32)utf8n_to_uvuni(t, tend - t, &ulen, flags);
+                   tlast = (I32)utf8n_to_uvchr(t, tend - t, &ulen, flags);
                    t += ulen;
                }
                else
@@ -4200,11 +4200,11 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
            /* now see if we need more "r" chars */
            if (rfirst > rlast) {
                if (r < rend) {
-                   rfirst = (I32)utf8n_to_uvuni(r, rend - r, &ulen, flags);
+                   rfirst = (I32)utf8n_to_uvchr(r, rend - r, &ulen, flags);
                    r += ulen;
                    if (r < rend && *r == ILLEGAL_UTF8_BYTE) {  /* illegal utf8 val indicates range */
                        r++;
-                       rlast = (I32)utf8n_to_uvuni(r, rend - r, &ulen, flags);
+                       rlast = (I32)utf8n_to_uvchr(r, rend - r, &ulen, flags);
                        r += ulen;
                    }
                    else
index 39f862e..588e448 100644 (file)
--- a/pp_pack.c
+++ b/pp_pack.c
@@ -319,7 +319,7 @@ uni_to_bytes(pTHX_ const char **s, const char *end, const char *buf, int buf_len
            const int flags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
            for (ptr = *s; ptr < from; ptr += UTF8SKIP(ptr)) {
                if (ptr >= end) break;
-               utf8n_to_uvuni((U8 *) ptr, end-ptr, &retlen, flags);
+               utf8n_to_uvchr((U8 *) ptr, end-ptr, &retlen, flags);
            }
            if (from > end) from = end;
        }
@@ -1316,10 +1316,10 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, const char *s, const char *strbeg, const c
                    len = UTF8SKIP(result);
                    if (!uni_to_bytes(aTHX_ &ptr, strend,
                                      (char *) &result[1], len-1, 'U')) break;
-                   auv = utf8n_to_uvuni(result, len, &retlen, UTF8_ALLOW_DEFAULT);
+                   auv = utf8n_to_uvchr(result, len, &retlen, UTF8_ALLOW_DEFAULT);
                    s = ptr;
                } else {
-                   auv = utf8n_to_uvuni((U8*)s, strend - s, &retlen, UTF8_ALLOW_DEFAULT);
+                   auv = utf8n_to_uvchr((U8*)s, strend - s, &retlen, UTF8_ALLOW_DEFAULT);
                    if (retlen == (STRLEN) -1 || retlen == 0)
                        Perl_croak(aTHX_ "Malformed UTF-8 string in unpack");
                    s += retlen;
@@ -2585,8 +2585,8 @@ S_pack_rec(pTHX_ SV *cat, tempsym_t* symptr, SV **beglist, SV **endlist )
                        GROWING(0, cat, start, cur, len+UTF8_MAXLEN);
                        end = start+SvLEN(cat)-UTF8_MAXLEN;
                    }
-                   cur = (char *) uvuni_to_utf8_flags((U8 *) cur,
-                                                      NATIVE_TO_UNI(auv),
+                   cur = (char *) uvchr_to_utf8_flags((U8 *) cur,
+                                                      auv,
                                                       warn_utf8 ?
                                                       0 : UNICODE_ALLOW_ANY);
                } else {
@@ -2639,7 +2639,7 @@ S_pack_rec(pTHX_ SV *cat, tempsym_t* symptr, SV **beglist, SV **endlist )
                auv = SvUV(fromstr);
                if (utf8) {
                    U8 buffer[UTF8_MAXLEN], *endb;
-                   endb = uvuni_to_utf8_flags(buffer, auv,
+                   endb = uvchr_to_utf8_flags(buffer, auv,
                                               warn_utf8 ?
                                               0 : UNICODE_ALLOW_ANY);
                    if (cur+(endb-buffer)*UTF8_EXPAND >= end) {
@@ -2657,7 +2657,7 @@ S_pack_rec(pTHX_ SV *cat, tempsym_t* symptr, SV **beglist, SV **endlist )
                        GROWING(0, cat, start, cur, len+UTF8_MAXLEN);
                        end = start+SvLEN(cat)-UTF8_MAXLEN;
                    }
-                   cur = (char *) uvuni_to_utf8_flags((U8 *) cur, auv,
+                   cur = (char *) uvchr_to_utf8_flags((U8 *) cur, auv,
                                                       warn_utf8 ?
                                                       0 : UNICODE_ALLOW_ANY);
                }
index bb89a54..3bb13a9 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -1443,7 +1443,7 @@ and would end up looking like:
    8: EXACT <baz>(10)
   10: END(0)
 
-    d = uvuni_to_utf8_flags(d, uv, 0);
+    d = uvchr_to_utf8_flags(d, uv, 0);
 
 is the recommended Unicode-aware way of saying
 
@@ -1455,7 +1455,7 @@ is the recommended Unicode-aware way of saying
        if (UTF) {                                                         \
             SV *zlopp = newSV(7); /* XXX: optimize me */                   \
            unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
-            unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
+            unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
            SvCUR_set(zlopp, kapow - flrbbbbb);                            \
            SvPOK_on(zlopp);                                               \
            SvUTF8_on(zlopp);                                              \
@@ -1470,12 +1470,12 @@ is the recommended Unicode-aware way of saying
     wordlen++;                                                                          \
     if ( UTF ) {                                                                        \
         /* if it is UTF then it is either already folded, or does not need folding */   \
-        uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags);             \
+        uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags);             \
     }                                                                                   \
     else if (folder == PL_fold_latin1) {                                                \
         /* if we use this folder we have to obey unicode rules on latin-1 data */       \
         if ( foldlen > 0 ) {                                                            \
-           uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags );       \
+           uvc = utf8n_to_uvchr( (const U8*) scan, UTF8_MAXLEN, &len, uniflags );       \
            foldlen -= len;                                                              \
            scan += len;                                                                 \
            len = 0;                                                                     \
index db6b730..ce22a00 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -1238,7 +1238,7 @@ STMT_START {                               \
     switch (trie_type) {                                                            \
     case trie_utf8_fold:                                                            \
         if ( foldlen>0 ) {                                                          \
-            uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
+            uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
             foldlen -= len;                                                         \
             uscan += len;                                                           \
             len=0;                                                                  \
@@ -1252,7 +1252,7 @@ STMT_START {                               \
         break;                                                                      \
     case trie_latin_utf8_fold:                                                      \
         if ( foldlen>0 ) {                                                          \
-            uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
+            uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
             foldlen -= len;                                                         \
             uscan += len;                                                           \
             len=0;                                                                  \
@@ -1265,7 +1265,7 @@ STMT_START {                               \
         }                                                                           \
         break;                                                                      \
     case trie_utf8:                                                                 \
-        uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags );        \
+        uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags );        \
         break;                                                                      \
     case trie_plain:                                                                \
         uvc = (UV)*uc;                                                              \
@@ -4050,7 +4050,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 
                    while (chars) {
                        if (utf8_target) {
-                           uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
+                           uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
                                                    uniflags);
                            uc += len;
                        }
@@ -4063,7 +4063,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                        while (foldlen) {
                            if (!--chars)
                                break;
-                           uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
+                           uvc = utf8n_to_uvchr(uscan, UTF8_MAXLEN, &len,
                                            uniflags);
                            uscan += len;
                            foldlen -= len;
diff --git a/toke.c b/toke.c
index 7b397e3..53f1f4f 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -1053,7 +1053,7 @@ Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
                    ENTER;
                    SAVESPTR(PL_warnhook);
                    PL_warnhook = PERL_WARNHOOK_FATAL;
-                   utf8n_to_uvuni((U8*)p, e-p, NULL, 0);
+                   utf8n_to_uvchr((U8*)p, e-p, NULL, 0);
                    LEAVE;
                }
            }
@@ -1437,13 +1437,13 @@ Perl_lex_peek_unichar(pTHX_ U32 flags)
                bufend = PL_parser->bufend;
            }
        }
-       unichar = utf8n_to_uvuni((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
+       unichar = utf8n_to_uvchr((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
        if (retlen == (STRLEN)-1) {
            /* malformed UTF-8 */
            ENTER;
            SAVESPTR(PL_warnhook);
            PL_warnhook = PERL_WARNHOOK_FATAL;
-           utf8n_to_uvuni((U8*)s, bufend-s, NULL, 0);
+           utf8n_to_uvchr((U8*)s, bufend-s, NULL, 0);
            LEAVE;
        }
        return unichar;
@@ -2761,7 +2761,7 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
     {
         /* If warnings are on, this will print a more detailed analysis of what
          * is wrong than the error message below */
-        utf8n_to_uvuni(first_bad_char_loc,
+        utf8n_to_uvchr(first_bad_char_loc,
                        e - ((char *) first_bad_char_loc),
                        NULL, 0);
 
@@ -2903,7 +2903,7 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
         if (! is_utf8_string_loc((U8 *) str, len, &first_bad_char_loc)) {
             /* If warnings are on, this will print a more detailed analysis of
              * what is wrong than the error message below */
-            utf8n_to_uvuni(first_bad_char_loc,
+            utf8n_to_uvchr(first_bad_char_loc,
                            (char *) first_bad_char_loc - str,
                            NULL, 0);
 
@@ -3442,7 +3442,7 @@ S_scan_const(pTHX_ char *start)
                     }
 
                     if (has_utf8) {
-                       d = (char*)uvuni_to_utf8((U8*)d, uv);
+                       d = (char*)uvchr_to_utf8((U8*)d, uv);
                        if (PL_lex_inwhat == OP_TRANS &&
                            PL_sublex_info.sub_op) {
                            PL_sublex_info.sub_op->op_private |=
diff --git a/utf8.c b/utf8.c
index 8d7e6de..3981fe8 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -2422,7 +2422,7 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
              s = SvPV_const(*svp, len);
              if (len == 1)
                   /* EIGHTBIT */
-                  len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
+                  len = uvchr_to_utf8(ustrp, *(U8*)s) - ustrp;
              else {
                   Copy(s, ustrp, len, U8);
              }
@@ -3216,10 +3216,7 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
        /* If not cached, generate it via swatch_get */
        if (!svp || !SvPOK(*svp)
                 || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
-           /* We use utf8n_to_uvuni() as we want an index into
-              Unicode tables, not a native character number.
-            */
-           const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
+           const UV code_point = utf8n_to_uvchr(ptr, UTF8_MAXBYTES, 0,
                                           ckWARN(WARN_UTF8) ?
                                           0 : UTF8_ALLOW_ANY);
            swatch = swatch_get(swash,
@@ -3904,7 +3901,7 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash)
 
            /* The key is the inverse mapping */
            char key[UTF8_MAXBYTES+1];
-           char* key_end = (char *) uvuni_to_utf8((U8*) key, val);
+           char* key_end = (char *) uvchr_to_utf8((U8*) key, val);
            STRLEN key_len = key_end - key;
 
            /* Get the list for the map */