Remove EBCDIC remappings
authorKarl Williamson <public@khwilliamson.com>
Sun, 17 Feb 2013 20:47:13 +0000 (13:47 -0700)
committerKarl Williamson <public@khwilliamson.com>
Thu, 29 Aug 2013 15:55:52 +0000 (09:55 -0600)
Now that the Unicode tables are stored in native format, we shouldn't be
doing remapping.

Note that this assumes that the Latin1 casing tables are stored in
native order; not all of this has been done yet.

handy.h
perly.c
pp.c
regcomp.c
regexec.c
utf8.c

diff --git a/handy.h b/handy.h
index 73fdaca..7b6db33 100644 (file)
--- a/handy.h
+++ b/handy.h
@@ -990,7 +990,7 @@ EXTCONST U32 PL_charclass[];
     /* The 1U keeps Solaris from griping when shifting sets the uppermost bit */
 #   define _CC_mask(classnum) (1U << (classnum))
 #   define _generic_isCC(c, classnum) cBOOL(FITS_IN_8_BITS(c) \
-                && (PL_charclass[(U8) NATIVE_TO_LATIN1(c)] & _CC_mask(classnum)))
+                && (PL_charclass[(U8) (c)] & _CC_mask(classnum)))
 
     /* The mask for the _A versions of the macros; it just adds in the bit for
      * ASCII. */
@@ -999,7 +999,7 @@ EXTCONST U32 PL_charclass[];
     /* The _A version makes sure that both the desired bit and the ASCII bit
      * are present */
 #   define _generic_isCC_A(c, classnum) (FITS_IN_8_BITS(c) \
-        && ((PL_charclass[(U8) NATIVE_TO_LATIN1(c)] & _CC_mask_A(classnum)) \
+        && ((PL_charclass[(U8) (c)] & _CC_mask_A(classnum)) \
                                 == _CC_mask_A(classnum)))
 
 #   define isALPHA_A(c)  _generic_isCC_A(c, _CC_ALPHA)
@@ -1020,7 +1020,7 @@ EXTCONST U32 PL_charclass[];
 
     /* Either participates in a fold with a character above 255, or is a
      * multi-char fold */
-#   define _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(c) ((! cBOOL(FITS_IN_8_BITS(c))) || (PL_charclass[(U8) NATIVE_TO_LATIN1(c)] & _CC_mask(_CC_NONLATIN1_FOLD)))
+#   define _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(c) ((! cBOOL(FITS_IN_8_BITS(c))) || (PL_charclass[(U8) (c)] & _CC_mask(_CC_NONLATIN1_FOLD)))
 
 #   define _isQUOTEMETA(c) _generic_isCC(c, _CC_QUOTEMETA)
 #   define _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) \
@@ -1177,8 +1177,7 @@ EXTCONST U32 PL_charclass[];
  * out-of-range */
 #define toLOWER_LATIN1(c)    ((! FITS_IN_8_BITS(c))                        \
                              ? (c)                                         \
-                             : LATIN1_TO_NATIVE(PL_latin1_lc[              \
-                                            NATIVE_TO_LATIN1( (U8) (c)) ]))
+                             : PL_latin1_lc[ (U8) (c) ])
 #define toLOWER_L1(c)    toLOWER_LATIN1(c)  /* Synonym for consistency */
 
 /* Modified uc.  Is correct uc except for three non-ascii chars which are
@@ -1186,8 +1185,7 @@ EXTCONST U32 PL_charclass[];
  * character for input out-of-range */
 #define toUPPER_LATIN1_MOD(c) ((! FITS_IN_8_BITS(c))                       \
                                ? (c)                                       \
-                               : LATIN1_TO_NATIVE(PL_mod_latin1_uc[        \
-                                            NATIVE_TO_LATIN1( (U8) (c)) ]))
+                               : PL_mod_latin1_uc[ (U8) (c) ])
 #ifdef USE_NEXT_CTYPE
 
 #  define isALPHANUMERIC_LC(c) NXIsAlNum((unsigned int)(c))
diff --git a/perly.c b/perly.c
index d7d9ea3..5a934dc 100644 (file)
--- a/perly.c
+++ b/perly.c
@@ -342,9 +342,12 @@ Perl_yyparse (pTHX_ int gramtype)
        parser->yychar = yylex();
 #endif
 
+/* perly.tab is shipped based on an ASCII system; if it were to be regenerated
+ * on a platform that doesn't use ASCII, this translation back would need to be
+ * removed */
 #  ifdef EBCDIC
        if (parser->yychar >= 0 && parser->yychar < 255) {
-           parser->yychar = NATIVE_TO_ASCII(parser->yychar);
+           parser->yychar = NATIVE_TO_LATIN1(parser->yychar);
        }
 #  endif
     }
diff --git a/pp.c b/pp.c
index cd50626..111012d 100644 (file)
--- a/pp.c
+++ b/pp.c
@@ -4244,7 +4244,7 @@ PP(pp_fc)
                     for (; s < send; s++) {
                         STRLEN ulen;
                         UV fc = _to_uni_fold_flags(*s, tmpbuf, &ulen, flags);
-                        if UNI_IS_INVARIANT(fc) {
+                        if NATIVE_IS_INVARIANT(fc) {
                             if (full_folding
                                 && *s == LATIN_SMALL_LETTER_SHARP_S)
                             {
index 34aefa1..bb89a54 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -1737,7 +1737,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
                    if ( !UTF ) {
                        /* store first byte of utf8 representation of
                           variant codepoints */
-                       if (! UNI_IS_INVARIANT(uvc)) {
+                       if (! NATIVE_IS_INVARIANT(uvc)) {
                            TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
                        }
                    }
@@ -4276,8 +4276,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
                        if (!(data->start_class->flags & ANYOF_LOCALE)) {
                            ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum) + 1);
                             for (value = 0; value < loop_max; value++) {
-                                if (! _generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
-                                    ANYOF_BITMAP_CLEAR(data->start_class, LATIN1_TO_NATIVE(value));
+                                if (! _generic_isCC(value, classnum)) {
+                                    ANYOF_BITMAP_CLEAR(data->start_class, value);
                                 }
                             }
                        }
@@ -4292,8 +4292,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
                         * in case it isn't a true locale-node.  This will
                         * create false positives if it truly is locale */
                         for (value = 0; value < loop_max; value++) {
-                            if (_generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
-                                ANYOF_BITMAP_SET(data->start_class, LATIN1_TO_NATIVE(value));
+                            if (_generic_isCC(value, classnum)) {
+                                ANYOF_BITMAP_SET(data->start_class, value);
                             }
                         }
                         }
@@ -4310,8 +4310,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
                        if (!(data->start_class->flags & ANYOF_LOCALE)) {
                            ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum));
                             for (value = 0; value < loop_max; value++) {
-                                if (_generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
-                                    ANYOF_BITMAP_CLEAR(data->start_class, LATIN1_TO_NATIVE(value));
+                                if (_generic_isCC(value, classnum)) {
+                                    ANYOF_BITMAP_CLEAR(data->start_class, value);
                                 }
                             }
                        }
@@ -4326,8 +4326,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
                         * case it isn't a true locale-node.  This will create
                         * false positives if it truly is locale */
                         for (value = 0; value < loop_max; value++) {
-                            if (! _generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
-                                ANYOF_BITMAP_SET(data->start_class, LATIN1_TO_NATIVE(value));
+                            if (! _generic_isCC(value, classnum)) {
+                                ANYOF_BITMAP_SET(data->start_class, value);
                             }
                         }
                         if (PL_regkind[OP(scan)] == NPOSIXD) {
@@ -10183,7 +10183,7 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32
     if (! len_passed_in) {
         if (UTF) {
             if (FOLD && (! LOC || code_point > 255)) {
-                _to_uni_fold_flags(NATIVE_TO_UNI(code_point),
+                _to_uni_fold_flags(code_point,
                                    character,
                                    &len,
                                    FOLD_FLAGS_FULL | ((LOC)
index 384e4e7..db6b730 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -1629,13 +1629,13 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
     case BOUNDL:
         RXp_MATCH_TAINTED_on(prog);
         FBC_BOUND(isWORDCHAR_LC,
-                  isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)),
+                  isWORDCHAR_LC_uvchr(tmp),
                   isWORDCHAR_LC_utf8((U8*)s));
         break;
     case NBOUNDL:
         RXp_MATCH_TAINTED_on(prog);
         FBC_NBOUND(isWORDCHAR_LC,
-                   isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)),
+                   isWORDCHAR_LC_uvchr(tmp),
                    isWORDCHAR_LC_utf8((U8*)s));
         break;
     case BOUND:
@@ -4305,7 +4305,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                     }
                }
                else {
-                   ln = isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(ln));
+                   ln = isWORDCHAR_LC_uvchr(ln);
                    n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC_utf8((U8*)locinput);
                }
            }
diff --git a/utf8.c b/utf8.c
index 2d827a1..8d7e6de 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -1407,11 +1407,7 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
        UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
        p += 2;
        if (uv < 0x80) {
-#ifdef EBCDIC
-           *d++ = LATIN1_TO_NATIVE(uv);
-#else
            *d++ = (U8)uv;
-#endif
            continue;
        }
        if (uv < 0x800) {
@@ -1645,8 +1641,8 @@ Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_
 
     assert(S_or_s == 'S' || S_or_s == 's');
 
-    if (UNI_IS_INVARIANT(converted)) { /* No difference between the two for
-                                         characters in this range */
+    if (NATIVE_IS_INVARIANT(converted)) { /* No difference between the two for
+                                            characters in this range */
        *p = (U8) converted;
        *lenp = 1;
        return converted;
@@ -1746,7 +1742,7 @@ S_to_lower_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp)
     U8 converted = toLOWER_LATIN1(c);
 
     if (p != NULL) {
-       if (UNI_IS_INVARIANT(converted)) {
+       if (NATIVE_IS_INVARIANT(converted)) {
            *p = converted;
            *lenp = 1;
        }
@@ -1816,7 +1812,7 @@ Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const unsigned int f
        converted = toLOWER_LATIN1(c);
     }
 
-    if (UNI_IS_INVARIANT(converted)) {
+    if (NATIVE_IS_INVARIANT(converted)) {
        *p = (U8) converted;
        *lenp = 1;
     }
@@ -1869,7 +1865,7 @@ bool
 Perl_is_uni_alnum_lc(pTHX_ UV c)
 {
     if (c < 256) {
-        return isALNUM_LC(UNI_TO_NATIVE(c));
+        return isALNUM_LC(c);
     }
     return _is_uni_FOO(_CC_WORDCHAR, c);
 }
@@ -1878,7 +1874,7 @@ bool
 Perl_is_uni_alnumc_lc(pTHX_ UV c)
 {
     if (c < 256) {
-        return isALPHANUMERIC_LC(UNI_TO_NATIVE(c));
+        return isALPHANUMERIC_LC(c);
     }
     return _is_uni_FOO(_CC_ALPHANUMERIC, c);
 }
@@ -1887,7 +1883,7 @@ bool
 Perl_is_uni_idfirst_lc(pTHX_ UV c)
 {
     if (c < 256) {
-        return isIDFIRST_LC(UNI_TO_NATIVE(c));
+        return isIDFIRST_LC(c);
     }
     return _is_uni_perl_idstart(c);
 }
@@ -1896,7 +1892,7 @@ bool
 Perl_is_uni_alpha_lc(pTHX_ UV c)
 {
     if (c < 256) {
-        return isALPHA_LC(UNI_TO_NATIVE(c));
+        return isALPHA_LC(c);
     }
     return _is_uni_FOO(_CC_ALPHA, c);
 }
@@ -1905,7 +1901,7 @@ bool
 Perl_is_uni_ascii_lc(pTHX_ UV c)
 {
     if (c < 256) {
-        return isASCII_LC(UNI_TO_NATIVE(c));
+        return isASCII_LC(c);
     }
     return 0;
 }
@@ -1914,7 +1910,7 @@ bool
 Perl_is_uni_blank_lc(pTHX_ UV c)
 {
     if (c < 256) {
-        return isBLANK_LC(UNI_TO_NATIVE(c));
+        return isBLANK_LC(c);
     }
     return isBLANK_uni(c);
 }
@@ -1923,7 +1919,7 @@ bool
 Perl_is_uni_space_lc(pTHX_ UV c)
 {
     if (c < 256) {
-        return isSPACE_LC(UNI_TO_NATIVE(c));
+        return isSPACE_LC(c);
     }
     return isSPACE_uni(c);
 }
@@ -1932,7 +1928,7 @@ bool
 Perl_is_uni_digit_lc(pTHX_ UV c)
 {
     if (c < 256) {
-        return isDIGIT_LC(UNI_TO_NATIVE(c));
+        return isDIGIT_LC(c);
     }
     return _is_uni_FOO(_CC_DIGIT, c);
 }
@@ -1941,7 +1937,7 @@ bool
 Perl_is_uni_upper_lc(pTHX_ UV c)
 {
     if (c < 256) {
-        return isUPPER_LC(UNI_TO_NATIVE(c));
+        return isUPPER_LC(c);
     }
     return _is_uni_FOO(_CC_UPPER, c);
 }
@@ -1950,7 +1946,7 @@ bool
 Perl_is_uni_lower_lc(pTHX_ UV c)
 {
     if (c < 256) {
-        return isLOWER_LC(UNI_TO_NATIVE(c));
+        return isLOWER_LC(c);
     }
     return _is_uni_FOO(_CC_LOWER, c);
 }
@@ -1959,7 +1955,7 @@ bool
 Perl_is_uni_cntrl_lc(pTHX_ UV c)
 {
     if (c < 256) {
-        return isCNTRL_LC(UNI_TO_NATIVE(c));
+        return isCNTRL_LC(c);
     }
     return 0;
 }
@@ -1968,7 +1964,7 @@ bool
 Perl_is_uni_graph_lc(pTHX_ UV c)
 {
     if (c < 256) {
-        return isGRAPH_LC(UNI_TO_NATIVE(c));
+        return isGRAPH_LC(c);
     }
     return _is_uni_FOO(_CC_GRAPH, c);
 }
@@ -1977,7 +1973,7 @@ bool
 Perl_is_uni_print_lc(pTHX_ UV c)
 {
     if (c < 256) {
-        return isPRINT_LC(UNI_TO_NATIVE(c));
+        return isPRINT_LC(c);
     }
     return _is_uni_FOO(_CC_PRINT, c);
 }
@@ -1986,7 +1982,7 @@ bool
 Perl_is_uni_punct_lc(pTHX_ UV c)
 {
     if (c < 256) {
-        return isPUNCT_LC(UNI_TO_NATIVE(c));
+        return isPUNCT_LC(c);
     }
     return _is_uni_FOO(_CC_PUNCT, c);
 }
@@ -1995,7 +1991,7 @@ bool
 Perl_is_uni_xdigit_lc(pTHX_ UV c)
 {
     if (c < 256) {
-       return isXDIGIT_LC(UNI_TO_NATIVE(c));
+       return isXDIGIT_LC(c);
     }
     return isXDIGIT_uni(c);
 }
@@ -2382,13 +2378,8 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
                        SV **swashp, const char *normal, const char *special)
 {
     dVAR;
-    U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
     STRLEN len = 0;
-    const UV uv0 = valid_utf8_to_uvchr(p, NULL);
-    /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
-     * are necessary in EBCDIC, they are redundant no-ops
-     * in ASCII-ish platforms, and hopefully optimized away. */
-    const UV uv1 = NATIVE_TO_UNI(uv0);
+    const UV uv1 = valid_utf8_to_uvchr(p, NULL);
 
     PERL_ARGS_ASSERT_TO_UTF8_CASE;
 
@@ -2414,8 +2405,6 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
         * be given */
     }
 
-    uvuni_to_utf8(tmpbuf, uv1);
-
     if (!*swashp) /* load on-demand */
          *swashp = _core_swash_init("utf8", normal, &PL_sv_undef, 4, 0, NULL, NULL);
 
@@ -2426,56 +2415,26 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
         SV **svp;
 
         if (hv &&
-            (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
+            (svp = hv_fetch(hv, (const char*)p, UNISKIP(uv1), FALSE)) &&
             (*svp)) {
             const char *s;
 
              s = SvPV_const(*svp, len);
              if (len == 1)
+                  /* EIGHTBIT */
                   len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
              else {
-#ifdef EBCDIC
-                  /* If we have EBCDIC we need to remap the characters
-                   * since any characters in the low 256 are Unicode
-                   * code points, not EBCDIC. */
-                  U8 *t = (U8*)s, *tend = t + len, *d;
-               
-                  d = tmpbuf;
-                  if (SvUTF8(*svp)) {
-                       STRLEN tlen = 0;
-                       
-                       while (t < tend) {
-                            const UV c = utf8_to_uvchr_buf(t, tend, &tlen);
-                            if (tlen > 0) {
-                                 d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
-                                 t += tlen;
-                            }
-                            else
-                                 break;
-                       }
-                  }
-                  else {
-                       while (t < tend) {
-                            d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
-                            t++;
-                       }
-                  }
-                  len = d - tmpbuf;
-                  Copy(tmpbuf, ustrp, len, U8);
-#else
                   Copy(s, ustrp, len, U8);
-#endif
              }
         }
     }
 
     if (!len && *swashp) {
-       const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE /* => is utf8 */);
+       const UV uv2 = swash_fetch(*swashp, p, TRUE /* => is utf8 */);
 
         if (uv2) {
              /* It was "normal" (a single character mapping). */
-             const UV uv3 = UNI_TO_NATIVE(uv2);
-             len = uvchr_to_utf8(ustrp, uv3) - ustrp;
+             len = uvchr_to_utf8(ustrp, uv2) - ustrp;
         }
     }
 
@@ -2496,7 +2455,7 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
     if (lenp)
         *lenp = len;
 
-    return uv0;
+    return uv1;
 
 }
 
@@ -3195,7 +3154,7 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
     U32 bit;
     SV *swatch;
     U8 tmputf8[2];
-    const UV c = NATIVE_TO_ASCII(*ptr);
+    const UV c = *ptr;
 
     PERL_ARGS_ASSERT_SWASH_FETCH;
 
@@ -3209,7 +3168,7 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
     }
 
     /* Convert to utf8 if not already */
-    if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
+    if (!do_utf8 && !NATIVE_IS_INVARIANT(c)) {
        tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
        tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
        ptr = tmputf8;
@@ -4605,7 +4564,7 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const c
                    to_utf8_fold(p1, foldbuf1, &n1);
                }
                else {  /* Not utf8, get utf8 fold */
-                   to_uni_fold(NATIVE_TO_LATIN1(*p1), foldbuf1, &n1);
+                   to_uni_fold(*p1, foldbuf1, &n1);
                }
                f1 = foldbuf1;
            }
@@ -4650,7 +4609,7 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const c
                    to_utf8_fold(p2, foldbuf2, &n2);
                }
                else {
-                   to_uni_fold(NATIVE_TO_LATIN1(*p2), foldbuf2, &n2);
+                   to_uni_fold(*p2, foldbuf2, &n2);
                }
                f2 = foldbuf2;
            }