From daf0f78e031c718c75590ef9ef573756f805776e Mon Sep 17 00:00:00 2001 From: Nick Ing-Simmons Date: Wed, 28 Mar 2001 14:38:24 +0000 Subject: [PATCH] More EBCDIC tweaks: - one more swash issue &~(0xA0-1) did not do the right thing, for UTF-EBCDIC where &~(0x80-1) does for UTF-8. - add "use re 'asciirange'" to make [!-~] etc. work use it in MIME::QuotedPrint and t/op/regexp.t and t/op/pat.t - Choose a key for t/op/each.t test which gets encoded. - Skip utf8decode if this is UTF-EBCDIC. p4raw-id: //depot/perlio@9400 --- ext/MIME/Base64/QuotedPrint.pm | 1 + ext/re/re.pm | 15 ++++++++------- perl.h | 1 + regcomp.c | 42 ++++++++++++++++++++++++++++++++++-------- t/op/each.t | 6 ++++-- t/op/pat.t | 3 +++ t/op/regexp.t | 2 ++ t/op/utf8decode.t | 20 +++++++++++++++++--- utf8.c | 3 ++- 9 files changed, 72 insertions(+), 21 deletions(-) diff --git a/ext/MIME/Base64/QuotedPrint.pm b/ext/MIME/Base64/QuotedPrint.pm index 069f322..b72a4b9 100644 --- a/ext/MIME/Base64/QuotedPrint.pm +++ b/ext/MIME/Base64/QuotedPrint.pm @@ -71,6 +71,7 @@ require Exporter; $VERSION = sprintf("%d.%02d", q$Revision: 2.3 $ =~ /(\d+)\.(\d+)/); +use re 'asciirange'; # ranges in regular expressions refer to ASCII sub encode_qp ($) { diff --git a/ext/re/re.pm b/ext/re/re.pm index 3f142d9..d66bda5 100644 --- a/ext/re/re.pm +++ b/ext/re/re.pm @@ -42,21 +42,21 @@ other transformations. When C is in effect, a regex is allowed to contain C<(?{ ... })> zero-width assertions even if regular expression contains -variable interpolation. That is normally disallowed, since it is a +variable interpolation. That is normally disallowed, since it is a potential security risk. Note that this pragma is ignored when the regular expression is obtained from tainted data, i.e. evaluation is always disallowed with tainted regular expresssions. See L. -For the purpose of this pragma, interpolation of precompiled regular +For the purpose of this pragma, interpolation of precompiled regular expressions (i.e., the result of C) is I considered variable interpolation. Thus: /foo${pat}bar/ -I allowed if $pat is a precompiled regular expression, even +I allowed if $pat is a precompiled regular expression, even if $pat contains C<(?{ ... })> assertions. -When C is in effect, perl emits debugging messages when +When C is in effect, perl emits debugging messages when compiling and using regular expressions. The output is the same as that obtained by running a C<-DDEBUGGING>-enabled perl interpreter with the B<-Dr> switch. It may be quite voluminous depending on the complexity @@ -64,7 +64,7 @@ of the match. Using C instead of C enables a form of output that can be used to get a colorful display on terminals that understand termcap color sequences. Set C<$ENV{PERL_RE_TC}> to a comma-separated list of C properties to use for highlighting -strings on/off, pre-point part on/off. +strings on/off, pre-point part on/off. See L for additional info. The directive C is I, as the @@ -77,8 +77,9 @@ See L. # N.B. File::Basename contains a literal for 'taint' as a fallback. If # taint is changed here, File::Basename must be updated as well. my %bitmask = ( -taint => 0x00100000, -eval => 0x00200000, +taint => 0x00100000, +eval => 0x00200000, +asciirange => 0x02000000, ); sub setcolor { diff --git a/perl.h b/perl.h index d1cb711..7e5d994 100644 --- a/perl.h +++ b/perl.h @@ -2807,6 +2807,7 @@ enum { /* pass one of these to get_vtbl */ #define HINT_FILETEST_ACCESS 0x00400000 #define HINT_UTF8 0x00800000 #define HINT_UTF8_DISTINCT 0x01000000 +#define HINT_RE_ASCIIR 0x02000000 /* Various states of an input record separator SV (rs, nrs) */ #define RsSNARF(sv) (! SvOK(sv)) diff --git a/regcomp.c b/regcomp.c index 33765ff..85f0e45 100644 --- a/regcomp.c +++ b/regcomp.c @@ -3402,9 +3402,16 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) for (value = 0; value < 128; value++) ANYOF_BITMAP_SET(ret, value); #else /* EBCDIC */ - for (value = 0; value < 256; value++) - if (isASCII(value)) - ANYOF_BITMAP_SET(ret, value); + for (value = 0; value < 256; value++) { + if (PL_hints & HINT_RE_ASCIIR) { + if (NATIVE_TO_ASCII(value) < 128) + ANYOF_BITMAP_SET(ret, value); + } + else { + if (isASCII(value)) + ANYOF_BITMAP_SET(ret, value); + } + } #endif /* EBCDIC */ } dont_optimize_invert = TRUE; @@ -3418,9 +3425,16 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) for (value = 128; value < 256; value++) ANYOF_BITMAP_SET(ret, value); #else /* EBCDIC */ - for (value = 0; value < 256; value++) - if (!isASCII(value)) - ANYOF_BITMAP_SET(ret, value); + for (value = 0; value < 256; value++) { + if (PL_hints & HINT_RE_ASCIIR) { + if (NATIVE_TO_ASCII(value) >= 128) + ANYOF_BITMAP_SET(ret, value); + } + else { + if (!isASCII(value)) + ANYOF_BITMAP_SET(ret, value); + } + } #endif /* EBCDIC */ } dont_optimize_invert = TRUE; @@ -3681,7 +3695,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) } /* end of namedclass \blah */ if (range) { - if (lastvalue > value) /* b-a */ { + if (((lastvalue > value) && !(PL_hints & HINT_RE_ASCIIR)) || + ((NATIVE_TO_UNI(lastvalue) > NATIVE_TO_UNI(value)) && (PL_hints & HINT_RE_ASCIIR))) /* b-a */ { Simple_vFAIL4("Invalid [] range \"%*.*s\"", RExC_parse - rangebegin, RExC_parse - rangebegin, @@ -3715,7 +3730,17 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!SIZE_ONLY) { if (lastvalue < 256 && value < 256) { #ifdef EBCDIC /* EBCDIC, for example. */ - if ((isLOWER(lastvalue) && isLOWER(value)) || + if (PL_hints & HINT_RE_ASCIIR) { + IV i; + /* New style scheme for ranges: + * after : + * use re 'asciir'; + * do ranges in ASCII/Unicode space + */ + for (i = NATIVE_TO_ASCII(lastvalue) ; i <= NATIVE_TO_ASCII(value); i++) + ANYOF_BITMAP_SET(ret, ASCII_TO_NATIVE(i)); + } + else if ((isLOWER(lastvalue) && isLOWER(value)) || (isUPPER(lastvalue) && isUPPER(value))) { IV i; @@ -4519,3 +4544,4 @@ clear_re(pTHXo_ void *r) { ReREFCNT_dec((regexp *)r); } + diff --git a/t/op/each.t b/t/op/each.t index daddc9c..6dd1cea 100755 --- a/t/op/each.t +++ b/t/op/each.t @@ -165,14 +165,16 @@ print "ok 24\n"; use bytes (); -$d = pack("U*", 0xe3, 0x81, 0x82); +# on EBCDIC chars are mapped differently so pick something that needs encoding +# there too. +$d = pack("U*", 0xe3, 0x81, 0xAF); $ol = bytes::length($d); print "not " unless $ol > 3; print "ok 25\n"; %u = ($d => "downgrade"); for (keys %u) { use bytes; - print "not " if length ne 3 or $_ ne "\xe3\x81\x82"; + print "not " if length ne 3 or $_ ne "\xe3\x81\xAF"; print "ok 26\n"; } { diff --git a/t/op/pat.t b/t/op/pat.t index 4c48c33..c3024a2 100755 --- a/t/op/pat.t +++ b/t/op/pat.t @@ -11,6 +11,9 @@ BEGIN { chdir 't' if -d 't'; @INC = '../lib'; } + +use re 'asciirange'; # Compute ranges in ASCII space + eval 'use Config'; # Defaults assumed if this fails $x = "abc\ndef\n"; diff --git a/t/op/regexp.t b/t/op/regexp.t index 4a4d42f..0b81e71 100755 --- a/t/op/regexp.t +++ b/t/op/regexp.t @@ -36,6 +36,8 @@ BEGIN { @INC = '../lib'; } +use re 'asciirange'; # ranges are computed in ASCII + $iters = shift || 1; # Poor man performance suite, 10000 is OK. open(TESTS,'op/re_tests') || open(TESTS,'t/op/re_tests') || diff --git a/t/op/utf8decode.t b/t/op/utf8decode.t index 4d05a6b..824805d 100644 --- a/t/op/utf8decode.t +++ b/t/op/utf8decode.t @@ -3,6 +3,20 @@ BEGIN { chdir 't' if -d 't'; @INC = '../lib'; + +} + +{ + my $wide = v256; + use bytes; + print STDERR ord($wide),"\n"; + if (ord($wide) == 140) { + print "1..0 # Skip: UTF-EBCDIC (not UTF-8) used here\n"; + exit 0; + } + elsif (ord($wide) != 196) { + warn sprintf("v256 starts with %02X\n",ord($wide)); + } } no utf8; @@ -13,7 +27,7 @@ my $test = 1; # This table is based on Markus Kuhn's UTF-8 Decode Stress Tester, # http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt, -# version dated 2000-09-02. +# version dated 2000-09-02. # We use the \x notation instead of raw binary bytes for \x00-\x1f\x7f-\xff # because e.g. many patch programs have issues with binary data. @@ -21,7 +35,7 @@ my $test = 1; my @MK = split(/\n/, <<__EOMK__); 1 Correct UTF-8 1.1.1 y "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5" - 11 ce:ba:e1:bd:b9:cf:83:ce:bc:ce:b5 5 -2 Boundary conditions +2 Boundary conditions 2.1 First possible sequence of certain length 2.1.1 y "\x00" 0 1 00 1 2.1.2 y "\xc2\x80" 80 2 c2:80 1 @@ -135,7 +149,7 @@ __EOMK__ sub moan { print "$id: @_"; } - + sub test_unpack_U { $WARNCNT = 0; $WARNMSG = ""; diff --git a/utf8.c b/utf8.c index 66d3fec..25cd0fd 100644 --- a/utf8.c +++ b/utf8.c @@ -1342,7 +1342,8 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr) PUSHMARK(SP); EXTEND(SP,3); PUSHs((SV*)sv); - PUSHs(sv_2mortal(newSViv(code_point & ~(needents - 1)))); + /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */ + PUSHs(sv_2mortal(newSViv((klen) ? (code_point & ~(needents - 1)) : 0))); PUSHs(sv_2mortal(newSViv(needents))); PUTBACK; if (call_method("SWASHGET", G_SCALAR)) -- 2.7.4