# range, with their names prefaced by 'Posix', to signify that these match
# what the Posix standard says they should match. A couple are
# effectively this, but the name doesn't have 'Posix' in it because there
- # just isn't any Posix equivalent.
+ # just isn't any Posix equivalent. 'XPosix' are the Posix tables extended
+ # to the full Unicode range, by our guesses as to what is appropriate.
# 'Any' is all code points. As an error check, instead of just setting it
# to be that, construct it to be the union of all the major categories
$Lower->set_equivalent_to($gc->table('Lowercase_Letter'),
Related => 1);
}
+ $Lower->add_alias('XPosixLower');
$perl->add_match_table("PosixLower",
Description => "[a-z]",
Initialize => $Lower & $ASCII,
$Upper->set_equivalent_to($gc->table('Uppercase_Letter'),
Related => 1);
}
+ $Upper->add_alias('XPosixUpper');
$perl->add_match_table("PosixUpper",
Description => "[A-Z]",
Initialize => $Upper & $ASCII,
$Alpha += $gc->table('Nl') if defined $gc->table('Nl');
$Alpha->add_description('Alphabetic');
}
+ $Alpha->add_alias('XPosixAlpha');
$perl->add_match_table("PosixAlpha",
Description => "[A-Za-z]",
Initialize => $Alpha & $ASCII,
Description => 'Alphabetic and (Decimal) Numeric',
Initialize => $Alpha + $gc->table('Decimal_Number'),
);
+ $Alnum->add_alias('XPosixAlnum');
$perl->add_match_table("PosixAlnum",
Description => "[A-Za-z0-9]",
Initialize => $Alnum & $ASCII,
Description => '\w, including beyond ASCII',
Initialize => $Alnum + $gc->table('Mark'),
);
+ $Word->add_alias('XPosixWord');
my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
$Word += $Pc if defined $Pc;
# This is a Perl extension, so the name doesn't begin with Posix.
- $perl->add_match_table('PerlWord',
+ my $PerlWord = $perl->add_match_table('PerlWord',
Description => '\w, restricted to ASCII = [A-Za-z0-9_]',
Initialize => $Word & $ASCII,
);
+ $PerlWord->add_alias('PosixWord');
my $Blank = $perl->add_match_table('Blank',
Description => '\h, Horizontal white space',
- 0x200B, # ZWSP
);
$Blank->add_alias('HorizSpace'); # Another name for it.
+ $Blank->add_alias('XPosixBlank');
$perl->add_match_table("PosixBlank",
Description => "\\t and ' '",
Initialize => $Blank & $ASCII,
Description => '\s including beyond ASCII plus vertical tab',
Initialize => $Blank + $VertSpace,
);
+ $Space->add_alias('XPosixSpace');
$perl->add_match_table("PosixSpace",
Description => "\\t, \\n, \\cK, \\f, \\r, and ' '. (\\cK is vertical tab)",
Initialize => $Space & $ASCII,
);
# Perl's traditional space doesn't include Vertical Tab
- my $SpacePerl = $perl->add_match_table('SpacePerl',
+ my $XPerlSpace = $perl->add_match_table('XPerlSpace',
Description => '\s, including beyond ASCII',
Initialize => $Space - 0x000B,
);
- $perl->add_match_table('PerlSpace',
+ $XPerlSpace->add_alias('SpacePerl'); # A pre-existing synonym
+ my $PerlSpace = $perl->add_match_table('PerlSpace',
Description => '\s, restricted to ASCII',
- Initialize => $SpacePerl & $ASCII,
+ Initialize => $XPerlSpace & $ASCII,
);
+
my $Cntrl = $perl->add_match_table('Cntrl',
Description => 'Control characters');
$Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1);
+ $Cntrl->add_alias('XPosixCntrl');
$perl->add_match_table("PosixCntrl",
Description => "ASCII control characters: NUL, SOH, STX, ETX, EOT, ENQ, ACK, BEL, BS, HT, LF, VT, FF, CR, SO, SI, DLE, DC1, DC2, DC3, DC4, NAK, SYN, ETB, CAN, EOM, SUB, ESC, FS, GS, RS, US, and DEL",
Initialize => $Cntrl & $ASCII,
Description => 'Characters that are graphical',
Initialize => ~ ($Space + $controls),
);
+ $Graph->add_alias('XPosixGraph');
$perl->add_match_table("PosixGraph",
Description =>
'[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~0-9A-Za-z]',
Description => 'Characters that are graphical plus space characters (but no controls)',
Initialize => $Blank + $Graph - $gc->table('Control'),
);
+ $print->add_alias('XPosixPrint');
$perl->add_match_table("PosixPrint",
Description =>
'[- 0-9A-Za-z!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
$Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1);
# \p{punct} doesn't include the symbols, which posix does
+ my $XPosixPunct = $perl->add_match_table('XPosixPunct',
+ Description => '\p{Punct} + ASCII-range \p{Symbol}',
+ Initialize => $gc->table('Punctuation')
+ + ($ASCII & $gc->table('Symbol')),
+ );
$perl->add_match_table('PosixPunct',
Description => '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
- Initialize => $ASCII & ($gc->table('Punctuation')
- + $gc->table('Symbol')),
+ Initialize => $ASCII & $XPosixPunct,
);
my $Digit = $perl->add_match_table('Digit',
Description => '[0-9] + all other decimal digits');
$Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1);
+ $Digit->add_alias('XPosixDigit');
my $PosixDigit = $perl->add_match_table("PosixDigit",
Description => '[0-9]',
Initialize => $Digit & $ASCII,
# Hex_Digit was not present in first release
my $Xdigit = $perl->add_match_table('XDigit');
+ $Xdigit->add_alias('XPosixXDigit');
my $Hex = property_ref('Hex_Digit');
if (defined $Hex && ! $Hex->is_empty) {
$Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1);
0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]);
$Xdigit->add_description('[0-9A-Fa-f] and corresponding fullwidth versions, like U+FF10: FULLWIDTH DIGIT ZERO');
}
+ $perl->add_match_table('PosixXDigit',
+ Initialize => $ASCII & $Xdigit,
+ Description => '[0-9A-Fa-f]',
+ );
my $dt = property_ref('Decomposition_Type');
$dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical',
appropriate characters in the full Unicode character set. For example,
C<\p{Alpha}> will match not just the ASCII alphabetic characters, but any
character in the entire Unicode character set that is considered to be
-alphabetic.
+alphabetic. The backslash sequence column is a (short) synonym for
+the Full-range Unicode form.
(Each of the counterparts has various synonyms as well.
L<perluniprops/Properties accessible through \p{} and \P{}> lists all the
absent, the classes will match only their ASCII-range counterparts. If you
disagree with this proposal, send email to C<perl5-porters@perl.org>.
- [[:...:]] ASCII-range Full-range backslash Note
- Unicode Unicode sequence
+ [[:...:]] ASCII-range Full-range backslash Note
+ Unicode Unicode sequence
-----------------------------------------------------
- alpha \p{PosixAlpha} \p{Alpha}
- alnum \p{PosixAlnum} \p{Alnum}
+ alpha \p{PosixAlpha} \p{XPosixAlpha}
+ alnum \p{PosixAlnum} \p{XPosixAlnum}
ascii \p{ASCII}
- blank \p{PosixBlank} \p{Blank} = [1]
- \p{HorizSpace} \h [1]
- cntrl \p{PosixCntrl} \p{Cntrl} [2]
- digit \p{PosixDigit} \p{Digit} \d
- graph \p{PosixGraph} \p{Graph} [3]
- lower \p{PosixLower} \p{Lower}
- print \p{PosixPrint} \p{Print} [4]
- punct \p{PosixPunct} \p{Punct} [5]
- \p{PerlSpace} \p{SpacePerl} \s [6]
- space \p{PosixSpace} \p{Space} [6]
- upper \p{PosixUpper} \p{Upper}
- word \p{PerlWord} \p{Word} \w
- xdigit \p{ASCII_Hex_Digit} \p{XDigit}
+ blank \p{PosixBlank} \p{XPosixBlank} \h [1]
+ or \p{HorizSpace} [1]
+ cntrl \p{PosixCntrl} \p{XPosixCntrl} [2]
+ digit \p{PosixDigit} \p{XPosixDigit} \d
+ graph \p{PosixGraph} \p{XPosixGraph} [3]
+ lower \p{PosixLower} \p{XPosixLower}
+ print \p{PosixPrint} \p{XPosixPrint} [4]
+ punct \p{PosixPunct} \p{XPosixPunct} [5]
+ \p{PerlSpace} \p{XPerlSpace} \s [6]
+ space \p{PosixSpace} \p{XPosixSpace} [6]
+ upper \p{PosixUpper} \p{XPosixUpper}
+ word \p{PosixWord} \p{XPosixWord} \w
+ xdigit \p{ASCII_Hex_Digit} \p{XPosixXDigit}
=over 4
=back
+There are various other synonyms that can be used for these besides
+C<\p{HorizSpace}> and C<\p{XPosixBlank}>.  For example
+C<\p{XPosixAlpha}> can be written as C<\p{Alpha}>.  All are listed
+in L<perluniprops/Properties accessible through \p{} and \P{}>.
+
=head4 Negation
X<character class, negation>
POSIX ASCII-range Full-range backslash
Unicode Unicode sequence
-----------------------------------------------------
- [[:^digit:]] \P{PosixDigit} \P{Digit} \D
- [[:^space:]] \P{PosixSpace} \P{Space}
- \P{PerlSpace} \P{SpacePerl} \S
- [[:^word:]] \P{PerlWord} \P{Word} \W
+ [[:^digit:]] \P{PosixDigit} \P{XPosixDigit} \D
+ [[:^space:]] \P{PosixSpace} \P{XPosixSpace}
+ \P{PerlSpace} \P{XPerlSpace} \S
+ [[:^word:]] \P{PosixWord} \P{XPosixWord} \W
+
+Again, the backslash sequence means Full-range Unicode.
=head4 [= =] and [. .]