Use separate macros for byte vs uv Unicode
author Karl Williamson <public@khwilliamson.com>
Mon, 25 Mar 2013 19:09:09 +0000 (13:09 -0600)
committer Karl Williamson <public@khwilliamson.com>
Wed, 11 Sep 2013 03:02:58 +0000 (21:02 -0600)
This removes a macro not yet even in a development release, and splits
its calls into two classes: those where the input is a byte; and those
where it can be any unsigned integer.  The byte implementation avoids a
function call on EBCDIC platforms.

dist/IO/IO.pm
dist/IO/IO.xs
doop.c
inline.h
pp.c
pp_pack.c
regcomp.c
sv.c
toke.c
utf8.c
utf8.h

index 35aba10..21583f5 100644 (file)
@@ -7,7 +7,7 @@ use Carp;
 use strict;
 use warnings;
 
-our $VERSION = "1.28";
+our $VERSION = "1.29";
 XSLoader::load 'IO', $VERSION;
 
 sub import {
index c603456..5ae41ae 100644 (file)
@@ -337,7 +337,7 @@ ungetc(handle, c)
                 croak("Negative character number in ungetc()");
 
             v = SvUV(c);
-            if (NATIVE_IS_INVARIANT(v) || (v <= 0xFF && !PerlIO_isutf8(handle)))
+            if (UVCHR_IS_INVARIANT(v) || (v <= 0xFF && !PerlIO_isutf8(handle)))
                 RETVAL = PerlIO_ungetc(handle, (int)v);
             else {
                 U8 buf[UTF8_MAXBYTES + 1], *end;
diff --git a/doop.c b/doop.c
index ac11c73..5031af8 100644 (file)
--- a/doop.c
+++ b/doop.c
@@ -331,7 +331,7 @@ S_do_trans_simple_utf8(pTHX_ SV * const sv)
        const U8 * const e = s + len;
        while (t < e) {
            const U8 ch = *t++;
-           hibit = !NATIVE_IS_INVARIANT(ch);
+           hibit = !NATIVE_BYTE_IS_INVARIANT(ch);
            if (hibit) {
                s = bytes_to_utf8(s, &len);
                break;
@@ -432,7 +432,7 @@ S_do_trans_count_utf8(pTHX_ SV * const sv)
        const U8 * const e = s + len;
        while (t < e) {
            const U8 ch = *t++;
-           hibit = !NATIVE_IS_INVARIANT(ch);
+           hibit = !NATIVE_BYTE_IS_INVARIANT(ch);
            if (hibit) {
                start = s = bytes_to_utf8(s, &len);
                break;
@@ -487,7 +487,7 @@ S_do_trans_complex_utf8(pTHX_ SV * const sv)
        const U8 * const e = s + len;
        while (t < e) {
            const U8 ch = *t++;
-           hibit = !NATIVE_IS_INVARIANT(ch);
+           hibit = !NATIVE_BYTE_IS_INVARIANT(ch);
            if (hibit) {
                s = bytes_to_utf8(s, &len);
                break;
index a2727f4..226970b 100644 (file)
--- a/inline.h
+++ b/inline.h
@@ -258,7 +258,7 @@ S_append_utf8_from_native_byte(const U8 byte, U8** dest)
 
     PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE;
 
-    if (NATIVE_IS_INVARIANT(byte))
+    if (NATIVE_BYTE_IS_INVARIANT(byte))
         *(*dest)++ = byte;
     else {
         *(*dest)++ = UTF8_EIGHT_BIT_HI(byte);
diff --git a/pp.c b/pp.c
index 6fc6c9f..860db37 100644 (file)
--- a/pp.c
+++ b/pp.c
@@ -4244,7 +4244,7 @@ PP(pp_fc)
                     for (; s < send; s++) {
                         STRLEN ulen;
                         UV fc = _to_uni_fold_flags(*s, tmpbuf, &ulen, flags);
-                        if NATIVE_IS_INVARIANT(fc) {
+                        if UVCHR_IS_INVARIANT(fc) {
                             if (full_folding
                                 && *s == LATIN_SMALL_LETTER_SHARP_S)
                             {
index 588e448..3c4e373 100644 (file)
--- a/pp_pack.c
+++ b/pp_pack.c
@@ -2003,7 +2003,7 @@ marked_upgrade(pTHX_ SV *sv, tempsym_t *sym_ptr) {
     from_start = SvPVX_const(sv);
     from_end = from_start + SvCUR(sv);
     for (from_ptr = from_start; from_ptr < from_end; from_ptr++)
-       if (!NATIVE_IS_INVARIANT(*from_ptr)) break;
+       if (!NATIVE_BYTE_IS_INVARIANT(*from_ptr)) break;
     if (from_ptr == from_end) {
        /* Simple case: no character needs to be changed */
        SvUTF8_on(sv);
index 3e6ec29..10b1aa3 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -1792,7 +1792,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
                    if ( !UTF ) {
                        /* store first byte of utf8 representation of
                           variant codepoints */
-                       if (! NATIVE_IS_INVARIANT(uvc)) {
+                       if (! UVCHR_IS_INVARIANT(uvc)) {
                            TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
                        }
                    }
@@ -4976,7 +4976,7 @@ S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
     Newx(dst, *plen_p * 2 + 1, U8);
 
     while (s < *plen_p) {
-        if (NATIVE_IS_INVARIANT(src[s]))
+        if (NATIVE_BYTE_IS_INVARIANT(src[s]))
             dst[d]   = src[s];
         else {
             dst[d++] = UTF8_EIGHT_BIT_HI(src[s]);
@@ -11186,7 +11186,7 @@ tryagain:
                      * utf8.  If we start to fold non-UTF patterns, be sure to
                      * update join_exact() */
                     if (LOC && ender < 256) {
-                        if (NATIVE_IS_INVARIANT(ender)) {
+                        if (UVCHR_IS_INVARIANT(ender)) {
                             *s = (U8) ender;
                             foldlen = 1;
                         } else {
diff --git a/sv.c b/sv.c
index e7be001..a3c4752 100644 (file)
--- a/sv.c
+++ b/sv.c
@@ -3316,7 +3316,7 @@ Perl_sv_utf8_upgrade_flags_grow(pTHX_ SV *const sv, const I32 flags, STRLEN extr
 
        while (t < e) {
            const U8 ch = *t++;
-           if (NATIVE_IS_INVARIANT(ch)) continue;
+           if (NATIVE_BYTE_IS_INVARIANT(ch)) continue;
 
            t--;    /* t already incremented; re-point to first variant */
            two_byte_count = 1;
@@ -3451,7 +3451,7 @@ must_be_utf8:
 
                while (d < e) {
                    const U8 chr = *d++;
-                   if (! NATIVE_IS_INVARIANT(chr)) two_byte_count++;
+                   if (! NATIVE_BYTE_IS_INVARIANT(chr)) two_byte_count++;
                }
 
                /* The string will expand by just the number of bytes that
@@ -3471,7 +3471,7 @@ must_be_utf8:
 
                e--;
                while (e >= t) {
-                   if (NATIVE_IS_INVARIANT(*e)) {
+                   if (NATIVE_BYTE_IS_INVARIANT(*e)) {
                        *d-- = *e;
                    } else {
                        *d-- = UTF8_EIGHT_BIT_LO(*e);
@@ -10879,7 +10879,7 @@ Perl_sv_vcatpvfn_flags(pTHX_ SV *const sv, const char *const pat, const STRLEN p
                goto unknown;
            uv = (args) ? va_arg(*args, int) : SvIV(argsv);
            if ((uv > 255 ||
-                (!NATIVE_IS_INVARIANT(uv) && SvUTF8(sv)))
+                (!UVCHR_IS_INVARIANT(uv) && SvUTF8(sv)))
                && !IN_BYTES) {
                eptr = (char*)utf8buf;
                elen = uvchr_to_utf8((U8*)eptr, uv) - utf8buf;
diff --git a/toke.c b/toke.c
index 968d30e..ef7d86b 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -3463,7 +3463,7 @@ S_scan_const(pTHX_ char *start)
                 * to recode the rest of the string into utf8 */
                
                /* Here uv is the ordinal of the next character being added */
-               if (!NATIVE_IS_INVARIANT(uv)) {
+               if (!UVCHR_IS_INVARIANT(uv)) {
                    if (!has_utf8 && uv > 255) {
                        /* Might need to recode whatever we have accumulated so
                         * far if it contains any chars variant in utf8 or
@@ -3797,7 +3797,7 @@ S_scan_const(pTHX_ char *start)
     default_action:
        /* If we started with encoded form, or already know we want it,
           then encode the next character */
-       if (! NATIVE_IS_INVARIANT((U8)(*s)) && (this_utf8 || has_utf8)) {
+       if (! NATIVE_BYTE_IS_INVARIANT((U8)(*s)) && (this_utf8 || has_utf8)) {
            STRLEN len  = 1;
 
 
@@ -11886,7 +11886,7 @@ Perl_scan_vstring(pTHX_ const char *s, const char *const e, SV *sv)
            /* Append native character for the rev point */
            tmpend = uvchr_to_utf8(tmpbuf, rev);
            sv_catpvn(sv, (const char*)tmpbuf, tmpend - tmpbuf);
-           if (!NATIVE_IS_INVARIANT(rev))
+           if (!UVCHR_IS_INVARIANT(rev))
                 SvUTF8_on(sv);
            if (pos + 1 < e && *pos == '.' && isDIGIT(pos[1]))
                 s = ++pos;
diff --git a/utf8.c b/utf8.c
index 51b9822..4745a63 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -1693,7 +1693,7 @@ Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_
 
     assert(S_or_s == 'S' || S_or_s == 's');
 
-    if (NATIVE_IS_INVARIANT(converted)) { /* No difference between the two for
+    if (UVCHR_IS_INVARIANT(converted)) { /* No difference between the two for
                                             characters in this range */
        *p = (U8) converted;
        *lenp = 1;
@@ -1794,7 +1794,7 @@ S_to_lower_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp)
     U8 converted = toLOWER_LATIN1(c);
 
     if (p != NULL) {
-       if (NATIVE_IS_INVARIANT(converted)) {
+       if (NATIVE_BYTE_IS_INVARIANT(converted)) {
            *p = converted;
            *lenp = 1;
        }
@@ -1864,7 +1864,7 @@ Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const unsigned int f
        converted = toLOWER_LATIN1(c);
     }
 
-    if (NATIVE_IS_INVARIANT(converted)) {
+    if (UVCHR_IS_INVARIANT(converted)) {
        *p = (U8) converted;
        *lenp = 1;
     }
diff --git a/utf8.h b/utf8.h
index 7036488..5880aa3 100644 (file)
--- a/utf8.h
+++ b/utf8.h
@@ -338,7 +338,12 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
  * UTF-8 encoded string) */
 #define UTF8_IS_INVARIANT(c)           UNI_IS_INVARIANT(NATIVE_UTF8_TO_I8(c))
 
-#define NATIVE_IS_INVARIANT(c)         UNI_IS_INVARIANT(NATIVE_TO_LATIN1(c))
+/* Like the above, but its name implies a non-UTF8 input, and is implemented
+ * differently (for no particular reason) */
+#define NATIVE_BYTE_IS_INVARIANT(c)    UNI_IS_INVARIANT(NATIVE_TO_LATIN1(c))
+
+/* Like the above, but accepts any UV as input */
+#define UVCHR_IS_INVARIANT(uv)          UNI_IS_INVARIANT(NATIVE_TO_UNI(uv))
 
 #define MAX_PORTABLE_UTF8_TWO_BYTE 0x3FF    /* constrained by EBCDIC */