From 8ada0baa1f731edbe470a7630cfeb30c131b4672 Mon Sep 17 00:00:00 2001 From: Jarkko Hietaniemi Date: Wed, 12 Aug 1998 18:42:35 +0300 Subject: [PATCH] apply minimal variant of patch (sent via private mail) Message-Id: <199808121242.PAA29761@comanche.spices> Subject: [PATCH] 5.004_02 or 5.005_51: fix regexp and tr character ranges in non-ASCII lands p4raw-id: //depot/perl@1803 --- MANIFEST | 1 + perl.h | 6 ++++++ pod/perllocale.pod | 2 +- pod/perlop.pod | 8 ++++++++ pod/perlre.pod | 7 +++++++ regcomp.c | 21 ++++++++++++++++++--- t/pragma/locale.t | 6 +++++- toke.c | 25 +++++++++++++++++++++---- 8 files changed, 67 insertions(+), 9 deletions(-) diff --git a/MANIFEST b/MANIFEST index 91147fd..966cc80 100644 --- a/MANIFEST +++ b/MANIFEST @@ -1116,6 +1116,7 @@ t/op/tie.t See if tie/untie functions work t/op/tiearray.t See if tie for arrays works t/op/tiehandle.t See if tie for handles works t/op/time.t See if time functions work +t/op/tr.t See if tr works t/op/undef.t See if undef works t/op/universal.t See if UNIVERSAL class works t/op/unshift.t See if unshift works diff --git a/perl.h b/perl.h index 0f7fe6d..c406f38 100644 --- a/perl.h +++ b/perl.h @@ -209,6 +209,12 @@ register struct op *op asm(stringify(OP_IN_REGISTER)); # define LIBERAL 1 #endif +#if 'A' == 65 && 'I' == 73 && 'J' == 74 && 'Z' == 90 +#define ASCIIish +#else +#undef ASCIIish +#endif + /* * The following contortions are brought to you on behalf of all the * standards, semi-standards, de facto standards, not-so-de-facto standards diff --git a/pod/perllocale.pod b/pod/perllocale.pod index 4401be2..0a85c0e 100644 --- a/pod/perllocale.pod +++ b/pod/perllocale.pod @@ -710,7 +710,7 @@ case-mapping with C<\l>, C<\L>,C<\u> or C<\U>. =item B (sprintf()): -Result is tainted if "use locale" is in effect. +Result is tainted if C is in effect. =item B (printf() and write()): diff --git a/pod/perlop.pod b/pod/perlop.pod index 35f9e5f..8e50ec3 100644 --- a/pod/perlop.pod +++ b/pod/perlop.pod @@ -1149,6 +1149,7 @@ the number of characters replaced or deleted. If no string is specified via the =~ or !~ operator, the $_ string is transliterated. (The string specified with =~ must be a scalar variable, an array element, a hash element, or an assignment to one of those, i.e., an lvalue.) + A character range may be specified with a hyphen, so C does the same replacement as C. For B devotees, C is provided as a synonym for C. If the @@ -1156,6 +1157,13 @@ SEARCHLIST is delimited by bracketing quotes, the REPLACEMENTLIST has its own pair of quotes, which may or may not be bracketing quotes, e.g., C or C. +Note also that the whole range idea is rather unportable between +character sets--and even within character sets they may cause results +you probably didn't expect. A sound principle is to use only ranges +that begin from and end at either alphabets of equal case (a-e, A-E), +or digits (0-4). Anything else is unsafe. If in doubt, spell out the +character sets in full. + Options: c Complement the SEARCHLIST. diff --git a/pod/perlre.pod b/pod/perlre.pod index 1b49ba4..f696525 100644 --- a/pod/perlre.pod +++ b/pod/perlre.pod @@ -735,6 +735,13 @@ following all specify the same class of three characters: C<[-az]>, C<[az-]>, and C<[a\-z]>. All are different from C<[a-z]>, which specifies a class containing twenty-six characters.) +Note also that the whole range idea is rather unportable between +character sets--and even within character sets they may cause results +you probably didn't expect. A sound principle is to use only ranges +that begin from and end at either alphabets of equal case ([a-e], +[A-E]), or digits ([0-9]). Anything else is unsafe. If in doubt, +spell out the character sets in full. + Characters may be specified using a metacharacter syntax much like that used in C: "\n" matches a newline, "\t" a tab, "\r" a carriage return, "\f" a form feed, etc. More generally, \I, where I is a string diff --git a/regcomp.c b/regcomp.c index ef651fa..8db8b8a 100644 --- a/regcomp.c +++ b/regcomp.c @@ -2245,9 +2245,24 @@ regclass(void) } } if (!SIZE_ONLY) { - for ( ; lastvalue <= value; lastvalue++) - ANYOF_SET(opnd, lastvalue); - } +#ifndef ASCIIish + if ((isLOWER(lastvalue) && isLOWER(value)) || + (isUPPER(lastvalue) && isUPPER(value))) { + if (isLOWER(lastvalue)) { + for (i = lastvalue; i <= value; i++) + if (isLOWER(i)) + ANYOF_SET(opnd, i); + } else { + for (i = lastvalue; i <= value; i++) + if (isUPPER(i)) + ANYOF_SET(opnd, i); + } + } + else +#endif + for ( ; lastvalue <= value; lastvalue++) + ANYOF_SET(opnd, lastvalue); + } lastvalue = value; } /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */ diff --git a/t/pragma/locale.t b/t/pragma/locale.t index 00baa66..7e3df8c 100755 --- a/t/pragma/locale.t +++ b/t/pragma/locale.t @@ -23,6 +23,9 @@ eval { # and mingw32 uses said silly CRT $have_setlocale = 0 if $^O eq 'MSWin32' && $Config{cc} =~ /^(cl|gcc)/i; +# 103 (the last test) may fail but that is okay. +# (It indicates something broken in the environment, not Perl) +# Therefore .. only until 102, not 103. print "1..", ($have_setlocale ? 102 : 98), "\n"; use vars qw($a @@ -404,6 +407,7 @@ print "ok 101\n"; # Test for read-onlys. +print "# testing 102\n"; { no locale; $a = "qwerty"; @@ -419,7 +423,7 @@ print "ok 102\n"; # Thanks to Hallvard Furuseth # for inventing a way to test for ordering consistency # without requiring any particular order. -# ++$jhi;#@iki.fi +# print "# testing 103\n"; { diff --git a/toke.c b/toke.c index 2381be3..90fc7b8 100644 --- a/toke.c +++ b/toke.c @@ -904,6 +904,7 @@ scan_const(char *start) /* expand a range A-Z to the full set of characters. AIE! */ if (dorange) { I32 i; /* current expanded character */ + I32 min; /* first character in range */ I32 max; /* last character in range */ i = d - SvPVX(sv); /* remember current offset */ @@ -911,10 +912,26 @@ scan_const(char *start) d = SvPVX(sv) + i; /* restore d after the grow potentially has changed the ptr */ d -= 2; /* eat the first char and the - */ - max = (U8)d[1]; /* last char in range */ - - for (i = (U8)*d; i <= max; i++) - *d++ = i; + min = (U8)*d; /* first char in range */ + max = (U8)d[1]; /* last char in range */ + +#ifndef ASCIIish + if ((isLOWER(min) && isLOWER(max)) || + (isUPPER(min) && isUPPER(max))) { + if (isLOWER(min)) { + for (i = min; i <= max; i++) + if (isLOWER(i)) + *d++ = i; + } else { + for (i = min; i <= max; i++) + if (isUPPER(i)) + *d++ = i; + } + } + else +#endif + for (i = min; i <= max; i++) + *d++ = i; /* mark the range as done, and continue */ dorange = FALSE; -- 2.7.4