$char =~ /[\200-\237]/;
}
- sub is_latin_1 {
+ sub is_latin_1 { # But not ASCII; not C1
my $char = substr(shift,0,1);
$char =~ /[\240-\377]/;
}
-The above would be adequate if the concern was only with numeric code points.
-However, the concern may be with characters rather than code points
-and on an EBCDIC platform it may be desirable for constructs such as
-C<if (is_print_ascii("A")) {print "A is a printable character\n";}> to print
-out the expected message. One way to represent the above collection
-of character classification subs that is capable of working across the
-four coded character sets discussed in this document is as follows:
+These are valid only on ASCII platforms, but can be easily rewritten to
+work on any platform as follows:
sub Is_c0 {
my $char = substr(shift,0,1);
- if (ord('^')==94) { # ascii
- return $char =~ /[\000-\037]/;
- }
- if (ord('^')==176) { # 0037
- return $char =~ /[\000-\003\067\055-\057\026\005\045\013-\023\074\075\062\046\030\031\077\047\034-\037]/;
- }
- if (ord('^')==95 || ord('^')==106) { # 1047 || posix-bc
- return $char =~ /[\000-\003\067\055-\057\026\005\025\013-\023\074\075\062\046\030\031\077\047\034-\037]/;
- }
+ return $char =~ /[[:cntrl:]]/
+ && $char =~ /[[:ascii:]]/
+ && ! Is_delete($char);
}
sub Is_print_ascii {
my $char = substr(shift,0,1);
- $char =~ /[ !"\#\$%&'()*+,\-.\/0-9:;<=>?\@A-Z[\\\]^_`a-z{|}~]/;
+
+ return $char =~ /[[:print:]]/ && $char =~ /[[:ascii:]]/;
+
+ # Alternatively:
+ # return $char
+ # =~ /[ !"\#\$%&'()*+,\-.\/0-9:;<=>?\@A-Z[\\\]^_`a-z{|}~]/;
}
sub Is_delete {
my $char = substr(shift,0,1);
- if (ord('^')==94) { # ascii
- return $char eq "\177";
- }
- else { # ebcdic
- return $char eq "\007";
- }
+ return utf8::native_to_unicode(ord $char) == 0x7F;
}
sub Is_c1 {
+ use feature 'unicode_strings';
my $char = substr(shift,0,1);
- if (ord('^')==94) { # ascii
- return $char =~ /[\200-\237]/;
- }
- if (ord('^')==176) { # 0037
- return $char =~ /[\040-\044\025\006\027\050-\054\011\012\033\060\061\032\063-\066\010\070-\073\040\024\076\377]/;
- }
- if (ord('^')==95) { # 1047
- return $char =~ /[\040-\045\006\027\050-\054\011\012\033\060\061\032\063-\066\010\070-\073\040\024\076\377]/;
- }
- if (ord('^')==106) { # posix-bc
- return $char =~
- /[\040-\045\006\027\050-\054\011\012\033\060\061\032\063-\066\010\070-\073\040\024\076\137]/;
- }
+ return $char =~ /[[:cntrl:]]/ && $char !~ /[[:ascii:]]/;
}
- sub Is_latin_1 {
+ sub Is_latin_1 { # But not ASCII; not C1
+ use feature 'unicode_strings';
my $char = substr(shift,0,1);
- if (ord('^')==94) { # ascii
- return $char =~ /[\240-\377]/;
- }
- if (ord('^')==176) { # 0037
- return $char =~
- /[\101\252\112\261\237\262\152\265\275\264\232\212\137\312\257\274\220\217\352\372\276\240\266\263\235\332\233\213\267\270\271\253\144\145\142\146\143\147\236\150\164\161-\163\170\165-\167\254\151\355\356\353\357\354\277\200\375\376\373\374\255\256\131\104\105\102\106\103\107\234\110\124\121-\123\130\125-\127\214\111\315\316\313\317\314\341\160\335\336\333\334\215\216\337]/;
- }
- if (ord('^')==95) { # 1047
- return $char =~
- /[\101\252\112\261\237\262\152\265\273\264\232\212\260\312\257\274\220\217\352\372\276\240\266\263\235\332\233\213\267\270\271\253\144\145\142\146\143\147\236\150\164\161-\163\170\165-\167\254\151\355\356\353\357\354\277\200\375\376\373\374\272\256\131\104\105\102\106\103\107\234\110\124\121-\123\130\125-\127\214\111\315\316\313\317\314\341\160\335\336\333\334\215\216\337]/;
- }
- if (ord('^')==106) { # posix-bc
- return $char =~
- /[\101\252\260\261\237\262\320\265\171\264\232\212\272\312\257\241\220\217\352\372\276\240\266\263\235\332\233\213\267\270\271\253\144\145\142\146\143\147\236\150\164\161-\163\170\165-\167\254\151\355\356\353\357\354\277\200\340\376\335\374\255\256\131\104\105\102\106\103\107\234\110\124\121-\123\130\125-\127\214\111\315\316\313\317\314\341\160\300\336\333\334\215\216\337]/;
- }
+ return ord($char) < 256
+ && $char !~ [[:ascii:]]
+ && $char !~ [[:cntrl:]];
}
-Note however that only the C<Is_ascii_print()> sub is really independent
-of coded character set. Another way to write C<Is_latin_1()> would be
+Another way to write C<Is_latin_1()> would be
to use the characters in the range explicitly:
sub Is_latin_1 {
);
# The following regular expression does not address the
# mappings for: ('.' => '%2E', '/' => '%2F', ':' => '%3A')
- $url =~ s/([\t "#%&\(\),;<=>\?\@\[\\\]^`{|}~])/sprintf("%%%02X",$e2a_1047[ord($1)])/ge;
+ $url =~ s/([\t "#%&\(\),;<=>\?\@\[\\\]^`{|}~])/
+ sprintf("%%%02X",$e2a_1047[ord($1)])/xge;
where a more complete solution would split the URL into components
and apply a full s/// substitution only to the appropriate parts.
@e2a = # EBCDIC to ASCII map (as shown above)
}
$qp_string =~
- s/([^ !"\#\$%&'()*+,\-.\/0-9:;<>?\@A-Z[\\\]^_`a-z{|}~$delete])/sprintf("=%02X",$e2a[ord($1)])/ge;
+ s/([^ !"\#\$%&'()*+,\-.\/0-9:;<>?\@A-Z[\\\]^_`a-z{|}~$delete])/
+ sprintf("=%02X",$e2a[ord($1)])/xge;
(although in production code the substitutions might be done
in the EBCDIC branch with the @e2a array and separately in the