More EBCDIC tweaks:

author Nick Ing-Simmons <nik@tiuk.ti.com>

Wed, 28 Mar 2001 14:38:24 +0000 (14:38 +0000)

committer Nick Ing-Simmons <nik@tiuk.ti.com>

Wed, 28 Mar 2001 14:38:24 +0000 (14:38 +0000)
author Nick Ing-Simmons <nik@tiuk.ti.com>
Wed, 28 Mar 2001 14:38:24 +0000 (14:38 +0000)
committer Nick Ing-Simmons <nik@tiuk.ti.com>
Wed, 28 Mar 2001 14:38:24 +0000 (14:38 +0000)
diff --git a/ext/MIME/Base64/QuotedPrint.pm b/ext/MIME/Base64/QuotedPrint.pm

index 069f322..b72a4b9 100644 (file)
--- a/ext/MIME/Base64/QuotedPrint.pm
+++ b/ext/MIME/Base64/QuotedPrint.pm
@@ -71,6 +71,7 @@ require Exporter;
  
  $VERSION = sprintf("%d.%02d", q$Revision: 2.3 $ =~ /(\d+)\.(\d+)/);
  
+use re 'asciirange'; # ranges in regular expressions refer to ASCII
  
  sub encode_qp ($)
  {
diff --git a/ext/re/re.pm b/ext/re/re.pm

index 3f142d9..d66bda5 100644 (file)
--- a/ext/re/re.pm
+++ b/ext/re/re.pm
@@ -42,21 +42,21 @@ other transformations.
  
  When C<use re 'eval'> is in effect, a regex is allowed to contain
  C<(?{ ... })> zero-width assertions even if regular expression contains
-variable interpolation.  That is normally disallowed, since it is a 
+variable interpolation.  That is normally disallowed, since it is a
  potential security risk.  Note that this pragma is ignored when the regular
  expression is obtained from tainted data, i.e.  evaluation is always
  disallowed with tainted regular expresssions.  See L<perlre/(?{ code })>.
  
-For the purpose of this pragma, interpolation of precompiled regular 
+For the purpose of this pragma, interpolation of precompiled regular
  expressions (i.e., the result of C<qr//>) is I<not> considered variable
  interpolation.  Thus:
  
      /foo${pat}bar/
  
-I<is> allowed if $pat is a precompiled regular expression, even 
+I<is> allowed if $pat is a precompiled regular expression, even
  if $pat contains C<(?{ ... })> assertions.
  
-When C<use re 'debug'> is in effect, perl emits debugging messages when 
+When C<use re 'debug'> is in effect, perl emits debugging messages when
  compiling and using regular expressions.  The output is the same as that
  obtained by running a C<-DDEBUGGING>-enabled perl interpreter with the
  B<-Dr> switch. It may be quite voluminous depending on the complexity
@@ -64,7 +64,7 @@ of the match.  Using C<debugcolor> instead of C<debug> enables a
  form of output that can be used to get a colorful display on terminals
  that understand termcap color sequences.  Set C<$ENV{PERL_RE_TC}> to a
  comma-separated list of C<termcap> properties to use for highlighting
-strings on/off, pre-point part on/off.  
+strings on/off, pre-point part on/off.
  See L<perldebug/"Debugging regular expressions"> for additional info.
  
  The directive C<use re 'debug'> is I<not lexically scoped>, as the
@@ -77,8 +77,9 @@ See L<perlmodlib/Pragmatic Modules>.
  # N.B. File::Basename contains a literal for 'taint' as a fallback.  If
  # taint is changed here, File::Basename must be updated as well.
  my %bitmask = (
-taint  => 0x00100000,
-eval   => 0x00200000,
+taint          => 0x00100000,
+eval           => 0x00200000,
+asciirange     => 0x02000000,
  );
  
  sub setcolor {
diff --git a/perl.h b/perl.h

index d1cb711..7e5d994 100644 (file)
--- a/perl.h
+++ b/perl.h
@@ -2807,6 +2807,7 @@ enum {            /* pass one of these to get_vtbl */
  #define HINT_FILETEST_ACCESS   0x00400000
  #define HINT_UTF8              0x00800000
  #define HINT_UTF8_DISTINCT     0x01000000
+#define HINT_RE_ASCIIR         0x02000000
  
  /* Various states of an input record separator SV (rs, nrs) */
  #define RsSNARF(sv)   (! SvOK(sv))
diff --git a/regcomp.c b/regcomp.c

index 33765ff..85f0e45 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -3402,9 +3402,16 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                         for (value = 0; value < 128; value++)
                             ANYOF_BITMAP_SET(ret, value);
  #else  /* EBCDIC */
-                       for (value = 0; value < 256; value++)
-                           if (isASCII(value))
-                               ANYOF_BITMAP_SET(ret, value);
+                       for (value = 0; value < 256; value++) {
+                           if (PL_hints & HINT_RE_ASCIIR) {
+                               if (NATIVE_TO_ASCII(value) < 128)
+                                   ANYOF_BITMAP_SET(ret, value);
+                           }
+                           else {
+                               if (isASCII(value))
+                                   ANYOF_BITMAP_SET(ret, value);
+                           }
+                       }
  #endif /* EBCDIC */
                     }
                     dont_optimize_invert = TRUE;
@@ -3418,9 +3425,16 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                         for (value = 128; value < 256; value++)
                             ANYOF_BITMAP_SET(ret, value);
  #else  /* EBCDIC */
-                       for (value = 0; value < 256; value++)
-                           if (!isASCII(value))
-                               ANYOF_BITMAP_SET(ret, value);
+                       for (value = 0; value < 256; value++) {
+                           if (PL_hints & HINT_RE_ASCIIR) {
+                               if (NATIVE_TO_ASCII(value) >= 128)
+                                   ANYOF_BITMAP_SET(ret, value);
+                           }
+                           else {
+                               if (!isASCII(value))
+                                   ANYOF_BITMAP_SET(ret, value);
+                           }
+                       }
  #endif /* EBCDIC */
                     }
                     dont_optimize_invert = TRUE;
@@ -3681,7 +3695,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
         } /* end of namedclass \blah */
  
         if (range) {
-           if (lastvalue > value) /* b-a */ {
+           if (((lastvalue > value) && !(PL_hints & HINT_RE_ASCIIR)) ||
+                ((NATIVE_TO_UNI(lastvalue) > NATIVE_TO_UNI(value)) && (PL_hints & HINT_RE_ASCIIR))) /* b-a */ {
                 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
                               RExC_parse - rangebegin,
                               RExC_parse - rangebegin,
@@ -3715,7 +3730,17 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
         if (!SIZE_ONLY) {
             if (lastvalue < 256 && value < 256) {
  #ifdef EBCDIC /* EBCDIC, for example. */
-               if ((isLOWER(lastvalue) && isLOWER(value)) ||
+               if (PL_hints & HINT_RE_ASCIIR) {
+                   IV i;
+                   /* New style scheme for ranges:
+                    * after :
+                    * use re 'asciir';
+                    * do ranges in ASCII/Unicode space
+                    */
+                   for (i = NATIVE_TO_ASCII(lastvalue) ; i <= NATIVE_TO_ASCII(value); i++)
+                       ANYOF_BITMAP_SET(ret, ASCII_TO_NATIVE(i));
+               }
+               else if ((isLOWER(lastvalue) && isLOWER(value)) ||
                     (isUPPER(lastvalue) && isUPPER(value)))
                 {
                     IV i;
@@ -4519,3 +4544,4 @@ clear_re(pTHXo_ void *r)
  {
      ReREFCNT_dec((regexp *)r);
  }
+
diff --git a/t/op/each.t b/t/op/each.t

index daddc9c..6dd1cea 100755 (executable)
--- a/t/op/each.t
+++ b/t/op/each.t
@@ -165,14 +165,16 @@ print "ok 24\n";
  
  use bytes ();
  
-$d = pack("U*", 0xe3, 0x81, 0x82);
+# on EBCDIC chars are mapped differently so pick something that needs encoding
+# there too.
+$d = pack("U*", 0xe3, 0x81, 0xAF);
  $ol = bytes::length($d);
  print "not " unless $ol > 3;
  print "ok 25\n";
  %u = ($d => "downgrade");
  for (keys %u) {
      use bytes;
-    print "not " if length ne 3 or $_ ne "\xe3\x81\x82";
+    print "not " if length ne 3 or $_ ne "\xe3\x81\xAF";
      print "ok 26\n";
  }
  {
diff --git a/t/op/pat.t b/t/op/pat.t

index 4c48c33..c3024a2 100755 (executable)
--- a/t/op/pat.t
+++ b/t/op/pat.t
@@ -11,6 +11,9 @@ BEGIN {
      chdir 't' if -d 't';
      @INC = '../lib';
  }
+
+use re 'asciirange'; # Compute ranges in ASCII space
+
  eval 'use Config';          #  Defaults assumed if this fails
  
  $x = "abc\ndef\n";
diff --git a/t/op/regexp.t b/t/op/regexp.t

index 4a4d42f..0b81e71 100755 (executable)
--- a/t/op/regexp.t
+++ b/t/op/regexp.t
@@ -36,6 +36,8 @@ BEGIN {
      @INC = '../lib';
  }
  
+use re 'asciirange'; # ranges are computed in ASCII
+
  $iters = shift || 1;           # Poor man performance suite, 10000 is OK.
  
  open(TESTS,'op/re_tests') || open(TESTS,'t/op/re_tests') ||
diff --git a/t/op/utf8decode.t b/t/op/utf8decode.t

index 4d05a6b..824805d 100644 (file)
--- a/t/op/utf8decode.t
+++ b/t/op/utf8decode.t
@@ -3,6 +3,20 @@
  BEGIN {
      chdir 't' if -d 't';
      @INC = '../lib';
+
+}
+
+{
+    my $wide = v256;
+    use bytes;
+    print STDERR ord($wide),"\n";
+    if (ord($wide) == 140) {
+       print "1..0 # Skip: UTF-EBCDIC (not UTF-8) used here\n";
+       exit 0;
+    }
+    elsif (ord($wide) != 196) {
+       warn sprintf("v256 starts with %02X\n",ord($wide));
+    }
  }
  
  no utf8;
@@ -13,7 +27,7 @@ my $test = 1;
  
  # This table is based on Markus Kuhn's UTF-8 Decode Stress Tester,
  # http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt,
-# version dated 2000-09-02. 
+# version dated 2000-09-02.
  
  # We use the \x notation instead of raw binary bytes for \x00-\x1f\x7f-\xff
  # because e.g. many patch programs have issues with binary data.
@@ -21,7 +35,7 @@ my $test = 1;
  my @MK = split(/\n/, <<__EOMK__);
  1      Correct UTF-8
  1.1.1 y "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5" -               11      ce:ba:e1:bd:b9:cf:83:ce:bc:ce:b5        5
-2      Boundary conditions 
+2      Boundary conditions
  2.1    First possible sequence of certain length
  2.1.1 y "\x00"                 0               1       00      1
  2.1.2 y "\xc2\x80"                     80              2       c2:80   1
@@ -135,7 +149,7 @@ __EOMK__
      sub moan {
         print "$id: @_";
      }
-    
+
      sub test_unpack_U {
         $WARNCNT = 0;
         $WARNMSG = "";
diff --git a/utf8.c b/utf8.c

index 66d3fec..25cd0fd 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -1342,7 +1342,8 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr)
             PUSHMARK(SP);
             EXTEND(SP,3);
             PUSHs((SV*)sv);
-           PUSHs(sv_2mortal(newSViv(code_point & ~(needents - 1))));
+           /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
+           PUSHs(sv_2mortal(newSViv((klen) ? (code_point & ~(needents - 1)) : 0)));
             PUSHs(sv_2mortal(newSViv(needents)));
             PUTBACK;
             if (call_method("SWASHGET", G_SCALAR))
author	Nick Ing-Simmons <nik@tiuk.ti.com>
	Wed, 28 Mar 2001 14:38:24 +0000 (14:38 +0000)
committer	Nick Ing-Simmons <nik@tiuk.ti.com>
	Wed, 28 Mar 2001 14:38:24 +0000 (14:38 +0000)
ext/MIME/Base64/QuotedPrint.pm		patch \| blob \| history
ext/re/re.pm		patch \| blob \| history
perl.h		patch \| blob \| history
regcomp.c		patch \| blob \| history
t/op/each.t		patch \| blob \| history
t/op/pat.t		patch \| blob \| history
t/op/regexp.t		patch \| blob \| history
t/op/utf8decode.t		patch \| blob \| history
utf8.c		patch \| blob \| history