From daf0f78e031c718c75590ef9ef573756f805776e Mon Sep 17 00:00:00 2001
From: Nick Ing-Simmons <nik@tiuk.ti.com>
Date: Wed, 28 Mar 2001 14:38:24 +0000
Subject: [PATCH] More EBCDIC tweaks:  - one more swash issue &~(0xA0-1) did
 not do the right thing,    for UTF-EBCDIC where &~(0x80-1) does for UTF-8.  -
 add "use re 'asciirange'" to make [!-~] etc. work    use it in
 MIME::QuotedPrint and t/op/regexp.t and t/op/pat.t  - Choose a key for
 t/op/each.t test which gets encoded.  - Skip utf8decode if this is
 UTF-EBCDIC.

p4raw-id: //depot/perlio@9400
---
 ext/MIME/Base64/QuotedPrint.pm |  1 +
 ext/re/re.pm                   | 15 ++++++++-------
 perl.h                         |  1 +
 regcomp.c                      | 42 ++++++++++++++++++++++++++++++++++--------
 t/op/each.t                    |  6 ++++--
 t/op/pat.t                     |  3 +++
 t/op/regexp.t                  |  2 ++
 t/op/utf8decode.t              | 20 +++++++++++++++++---
 utf8.c                         |  3 ++-
 9 files changed, 72 insertions(+), 21 deletions(-)
diff --git a/ext/MIME/Base64/QuotedPrint.pm b/ext/MIME/Base64/QuotedPrint.pm
index 069f322..b72a4b9 100644
--- a/ext/MIME/Base64/QuotedPrint.pm
+++ b/ext/MIME/Base64/QuotedPrint.pm
@@ -71,6 +71,7 @@ require Exporter;
 
 $VERSION = sprintf("%d.%02d", q$Revision: 2.3 $ =~ /(\d+)\.(\d+)/);
 
+use re 'asciirange'; # ranges in regular expressions refer to ASCII
 
 sub encode_qp ($)
 {
diff --git a/ext/re/re.pm b/ext/re/re.pm
index 3f142d9..d66bda5 100644
--- a/ext/re/re.pm
+++ b/ext/re/re.pm
@@ -42,21 +42,21 @@ other transformations.
 
 When C<use re 'eval'> is in effect, a regex is allowed to contain
 C<(?{ ... })> zero-width assertions even if regular expression contains
-variable interpolation.  That is normally disallowed, since it is a 
+variable interpolation.  That is normally disallowed, since it is a
 potential security risk.  Note that this pragma is ignored when the regular
 expression is obtained from tainted data, i.e.  evaluation is always
 disallowed with tainted regular expresssions.  See L<perlre/(?{ code })>.
 
-For the purpose of this pragma, interpolation of precompiled regular 
+For the purpose of this pragma, interpolation of precompiled regular
 expressions (i.e., the result of C<qr//>) is I<not> considered variable
 interpolation.  Thus:
 
     /foo${pat}bar/
 
-I<is> allowed if $pat is a precompiled regular expression, even 
+I<is> allowed if $pat is a precompiled regular expression, even
 if $pat contains C<(?{ ... })> assertions.
 
-When C<use re 'debug'> is in effect, perl emits debugging messages when 
+When C<use re 'debug'> is in effect, perl emits debugging messages when
 compiling and using regular expressions.  The output is the same as that
 obtained by running a C<-DDEBUGGING>-enabled perl interpreter with the
 B<-Dr> switch. It may be quite voluminous depending on the complexity
@@ -64,7 +64,7 @@ of the match.  Using C<debugcolor> instead of C<debug> enables a
 form of output that can be used to get a colorful display on terminals
 that understand termcap color sequences.  Set C<$ENV{PERL_RE_TC}> to a
 comma-separated list of C<termcap> properties to use for highlighting
-strings on/off, pre-point part on/off.  
+strings on/off, pre-point part on/off.
 See L<perldebug/"Debugging regular expressions"> for additional info.
 
 The directive C<use re 'debug'> is I<not lexically scoped>, as the
@@ -77,8 +77,9 @@ See L<perlmodlib/Pragmatic Modules>.
 # N.B. File::Basename contains a literal for 'taint' as a fallback.  If
 # taint is changed here, File::Basename must be updated as well.
 my %bitmask = (
-taint	=> 0x00100000,
-eval	=> 0x00200000,
+taint		=> 0x00100000,
+eval		=> 0x00200000,
+asciirange	=> 0x02000000,
 );
 
 sub setcolor {
diff --git a/perl.h b/perl.h
index d1cb711..7e5d994 100644
--- a/perl.h
+++ b/perl.h
@@ -2807,6 +2807,7 @@ enum {		/* pass one of these to get_vtbl */
 #define HINT_FILETEST_ACCESS	0x00400000
 #define HINT_UTF8		0x00800000
 #define HINT_UTF8_DISTINCT	0x01000000
+#define HINT_RE_ASCIIR		0x02000000
 
 /* Various states of an input record separator SV (rs, nrs) */
 #define RsSNARF(sv)   (! SvOK(sv))
diff --git a/regcomp.c b/regcomp.c
index 33765ff..85f0e45 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -3402,9 +3402,16 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
 			for (value = 0; value < 128; value++)
 			    ANYOF_BITMAP_SET(ret, value);
 #else  /* EBCDIC */
-			for (value = 0; value < 256; value++)
-			    if (isASCII(value))
-				ANYOF_BITMAP_SET(ret, value);
+			for (value = 0; value < 256; value++) {
+			    if (PL_hints & HINT_RE_ASCIIR) {
+				if (NATIVE_TO_ASCII(value) < 128)
+				    ANYOF_BITMAP_SET(ret, value);
+			    }
+			    else {
+				if (isASCII(value))
+				    ANYOF_BITMAP_SET(ret, value);
+			    }
+			}
 #endif /* EBCDIC */
 		    }
 		    dont_optimize_invert = TRUE;
@@ -3418,9 +3425,16 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
 			for (value = 128; value < 256; value++)
 			    ANYOF_BITMAP_SET(ret, value);
 #else  /* EBCDIC */
-			for (value = 0; value < 256; value++)
-			    if (!isASCII(value))
-				ANYOF_BITMAP_SET(ret, value);
+			for (value = 0; value < 256; value++) {
+			    if (PL_hints & HINT_RE_ASCIIR) {
+				if (NATIVE_TO_ASCII(value) >= 128)
+				    ANYOF_BITMAP_SET(ret, value);
+			    }
+			    else {
+				if (!isASCII(value))
+				    ANYOF_BITMAP_SET(ret, value);
+			    }
+			}
 #endif /* EBCDIC */
 		    }
 		    dont_optimize_invert = TRUE;
@@ -3681,7 +3695,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
 	} /* end of namedclass \blah */
 
 	if (range) {
-	    if (lastvalue > value) /* b-a */ {
+	    if (((lastvalue > value) && !(PL_hints & HINT_RE_ASCIIR)) ||
+                ((NATIVE_TO_UNI(lastvalue) > NATIVE_TO_UNI(value)) && (PL_hints & HINT_RE_ASCIIR))) /* b-a */ {
 		Simple_vFAIL4("Invalid [] range \"%*.*s\"",
 			      RExC_parse - rangebegin,
 			      RExC_parse - rangebegin,
@@ -3715,7 +3730,17 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
 	if (!SIZE_ONLY) {
 	    if (lastvalue < 256 && value < 256) {
 #ifdef EBCDIC /* EBCDIC, for example. */
-		if ((isLOWER(lastvalue) && isLOWER(value)) ||
+		if (PL_hints & HINT_RE_ASCIIR) {
+		    IV i;
+		    /* New style scheme for ranges:
+		     * after :
+		     * use re 'asciir';
+		     * do ranges in ASCII/Unicode space
+		     */
+		    for (i = NATIVE_TO_ASCII(lastvalue) ; i <= NATIVE_TO_ASCII(value); i++)
+			ANYOF_BITMAP_SET(ret, ASCII_TO_NATIVE(i));
+		}
+		else if ((isLOWER(lastvalue) && isLOWER(value)) ||
 		    (isUPPER(lastvalue) && isUPPER(value)))
 		{
 		    IV i;
@@ -4519,3 +4544,4 @@ clear_re(pTHXo_ void *r)
 {
     ReREFCNT_dec((regexp *)r);
 }
+
diff --git a/t/op/each.t b/t/op/each.t
index daddc9c..6dd1cea 100755
--- a/t/op/each.t
+++ b/t/op/each.t
@@ -165,14 +165,16 @@ print "ok 24\n";
 
 use bytes ();
 
-$d = pack("U*", 0xe3, 0x81, 0x82);
+# on EBCDIC chars are mapped differently so pick something that needs encoding
+# there too.
+$d = pack("U*", 0xe3, 0x81, 0xAF);
 $ol = bytes::length($d);
 print "not " unless $ol > 3;
 print "ok 25\n";
 %u = ($d => "downgrade");
 for (keys %u) {
     use bytes;
-    print "not " if length ne 3 or $_ ne "\xe3\x81\x82";
+    print "not " if length ne 3 or $_ ne "\xe3\x81\xAF";
     print "ok 26\n";
 }
 {
diff --git a/t/op/pat.t b/t/op/pat.t
index 4c48c33..c3024a2 100755
--- a/t/op/pat.t
+++ b/t/op/pat.t
@@ -11,6 +11,9 @@ BEGIN {
     chdir 't' if -d 't';
     @INC = '../lib';
 }
+
+use re 'asciirange'; # Compute ranges in ASCII space
+
 eval 'use Config';          #  Defaults assumed if this fails
 
 $x = "abc\ndef\n";
diff --git a/t/op/regexp.t b/t/op/regexp.t
index 4a4d42f..0b81e71 100755
--- a/t/op/regexp.t
+++ b/t/op/regexp.t
@@ -36,6 +36,8 @@ BEGIN {
     @INC = '../lib';
 }
 
+use re 'asciirange'; # ranges are computed in ASCII
+
 $iters = shift || 1;		# Poor man performance suite, 10000 is OK.
 
 open(TESTS,'op/re_tests') || open(TESTS,'t/op/re_tests') ||
diff --git a/t/op/utf8decode.t b/t/op/utf8decode.t
index 4d05a6b..824805d 100644
--- a/t/op/utf8decode.t
+++ b/t/op/utf8decode.t
@@ -3,6 +3,20 @@
 BEGIN {
     chdir 't' if -d 't';
     @INC = '../lib';
+
+}
+
+{
+    my $wide = v256;
+    use bytes;
+    print STDERR ord($wide),"\n";
+    if (ord($wide) == 140) {
+	print "1..0 # Skip: UTF-EBCDIC (not UTF-8) used here\n";
+	exit 0;
+    }
+    elsif (ord($wide) != 196) {
+	warn sprintf("v256 starts with %02X\n",ord($wide));
+    }
 }
 
 no utf8;
@@ -13,7 +27,7 @@ my $test = 1;
 
 # This table is based on Markus Kuhn's UTF-8 Decode Stress Tester,
 # http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt,
-# version dated 2000-09-02. 
+# version dated 2000-09-02.
 
 # We use the \x notation instead of raw binary bytes for \x00-\x1f\x7f-\xff
 # because e.g. many patch programs have issues with binary data.
@@ -21,7 +35,7 @@ my $test = 1;
 my @MK = split(/\n/, <<__EOMK__);
 1	Correct UTF-8
 1.1.1 y "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5"	-		11	ce:ba:e1:bd:b9:cf:83:ce:bc:ce:b5	5
-2	Boundary conditions 
+2	Boundary conditions
 2.1	First possible sequence of certain length
 2.1.1 y "\x00"			0		1	00	1
 2.1.2 y "\xc2\x80"			80		2	c2:80	1
@@ -135,7 +149,7 @@ __EOMK__
     sub moan {
 	print "$id: @_";
     }
-    
+
     sub test_unpack_U {
 	$WARNCNT = 0;
 	$WARNMSG = "";
diff --git a/utf8.c b/utf8.c
index 66d3fec..25cd0fd 100644
--- a/utf8.c
+++ b/utf8.c
@@ -1342,7 +1342,8 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr)
 	    PUSHMARK(SP);
 	    EXTEND(SP,3);
 	    PUSHs((SV*)sv);
-	    PUSHs(sv_2mortal(newSViv(code_point & ~(needents - 1))));
+	    /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
+	    PUSHs(sv_2mortal(newSViv((klen) ? (code_point & ~(needents - 1)) : 0)));
 	    PUSHs(sv_2mortal(newSViv(needents)));
 	    PUTBACK;
 	    if (call_method("SWASHGET", G_SCALAR))
-- 
2.7.4