# range, with their names prefaced by 'Posix', to signify that these match
# what the Posix standard says they should match. A couple are
# effectively this, but the name doesn't have 'Posix' in it because there
- # just isn't any Posix equivalent.
+ # just isn't any Posix equivalent. Tables whose names begin with 'XPosix'
+ # are the Posix tables extended to the full Unicode range, using our best
+ # guess as to what is appropriate.
# 'Any' is all code points. As an error check, instead of just setting it
# to be that, construct it to be the union of all the major categories
$Lower->set_equivalent_to($gc->table('Lowercase_Letter'),
Related => 1);
}
+ $Lower->add_alias('XPosixLower');
$perl->add_match_table("PosixLower",
Description => "[a-z]",
Initialize => $Lower & $ASCII,
$Upper->set_equivalent_to($gc->table('Uppercase_Letter'),
Related => 1);
}
+ $Upper->add_alias('XPosixUpper');
$perl->add_match_table("PosixUpper",
Description => "[A-Z]",
Initialize => $Upper & $ASCII,
$Alpha += $gc->table('Nl') if defined $gc->table('Nl');
$Alpha->add_description('Alphabetic');
}
+ $Alpha->add_alias('XPosixAlpha');
$perl->add_match_table("PosixAlpha",
Description => "[A-Za-z]",
Initialize => $Alpha & $ASCII,
Description => 'Alphabetic and (Decimal) Numeric',
Initialize => $Alpha + $gc->table('Decimal_Number'),
);
+ $Alnum->add_alias('XPosixAlnum');
$perl->add_match_table("PosixAlnum",
Description => "[A-Za-z0-9]",
Initialize => $Alnum & $ASCII,
Description => '\w, including beyond ASCII',
Initialize => $Alnum + $gc->table('Mark'),
);
+ $Word->add_alias('XPosixWord');
my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
$Word += $Pc if defined $Pc;
# This is a Perl extension, so the name doesn't begin with Posix.
- $perl->add_match_table('PerlWord',
+ my $PerlWord = $perl->add_match_table('PerlWord',
Description => '\w, restricted to ASCII = [A-Za-z0-9_]',
Initialize => $Word & $ASCII,
);
+ $PerlWord->add_alias('PosixWord');
my $Blank = $perl->add_match_table('Blank',
Description => '\h, Horizontal white space',
- 0x200B, # ZWSP
);
$Blank->add_alias('HorizSpace'); # Another name for it.
+ $Blank->add_alias('XPosixBlank');
$perl->add_match_table("PosixBlank",
Description => "\\t and ' '",
Initialize => $Blank & $ASCII,
Description => '\s including beyond ASCII plus vertical tab',
Initialize => $Blank + $VertSpace,
);
+ $Space->add_alias('XPosixSpace');
$perl->add_match_table("PosixSpace",
Description => "\\t, \\n, \\cK, \\f, \\r, and ' '. (\\cK is vertical tab)",
Initialize => $Space & $ASCII,
);
# Perl's traditional space doesn't include Vertical Tab
- my $SpacePerl = $perl->add_match_table('SpacePerl',
+ my $XPerlSpace = $perl->add_match_table('XPerlSpace',
Description => '\s, including beyond ASCII',
Initialize => $Space - 0x000B,
);
- $perl->add_match_table('PerlSpace',
+ $XPerlSpace->add_alias('SpacePerl'); # A pre-existing synonym
+ my $PerlSpace = $perl->add_match_table('PerlSpace',
Description => '\s, restricted to ASCII',
- Initialize => $SpacePerl & $ASCII,
+ Initialize => $XPerlSpace & $ASCII,
);
+
my $Cntrl = $perl->add_match_table('Cntrl',
Description => 'Control characters');
$Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1);
+ $Cntrl->add_alias('XPosixCntrl');
$perl->add_match_table("PosixCntrl",
Description => "ASCII control characters: NUL, SOH, STX, ETX, EOT, ENQ, ACK, BEL, BS, HT, LF, VT, FF, CR, SO, SI, DLE, DC1, DC2, DC3, DC4, NAK, SYN, ETB, CAN, EOM, SUB, ESC, FS, GS, RS, US, and DEL",
Initialize => $Cntrl & $ASCII,
Description => 'Characters that are graphical',
Initialize => ~ ($Space + $controls),
);
+ $Graph->add_alias('XPosixGraph');
$perl->add_match_table("PosixGraph",
Description =>
'[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~0-9A-Za-z]',
Description => 'Characters that are graphical plus space characters (but no controls)',
Initialize => $Blank + $Graph - $gc->table('Control'),
);
+ $print->add_alias('XPosixPrint');
$perl->add_match_table("PosixPrint",
Description =>
'[- 0-9A-Za-z!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
$Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1);
# \p{punct} doesn't include the symbols, which posix does
+ my $XPosixPunct = $perl->add_match_table('XPosixPunct',
+ Description => '\p{Punct} + ASCII-range \p{Symbol}',
+ Initialize => $gc->table('Punctuation')
+ + ($ASCII & $gc->table('Symbol')),
+ );
$perl->add_match_table('PosixPunct',
Description => '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
- Initialize => $ASCII & ($gc->table('Punctuation')
- + $gc->table('Symbol')),
+ Initialize => $ASCII & $XPosixPunct,
);
my $Digit = $perl->add_match_table('Digit',
Description => '[0-9] + all other decimal digits');
$Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1);
+ $Digit->add_alias('XPosixDigit');
my $PosixDigit = $perl->add_match_table("PosixDigit",
Description => '[0-9]',
Initialize => $Digit & $ASCII,
# Hex_Digit was not present in first release
my $Xdigit = $perl->add_match_table('XDigit');
+ $Xdigit->add_alias('XPosixXDigit');
my $Hex = property_ref('Hex_Digit');
if (defined $Hex && ! $Hex->is_empty) {
$Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1);
0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]);
$Xdigit->add_description('[0-9A-Fa-f] and corresponding fullwidth versions, like U+FF10: FULLWIDTH DIGIT ZERO');
}
+ $perl->add_match_table('PosixXDigit',
+ Initialize => $ASCII & $Xdigit,
+ Description => '[0-9A-Fa-f]',
+ );
my $dt = property_ref('Decomposition_Type');
$dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical',
appropriate characters in the full Unicode character set. For example,
C<\p{Alpha}> will match not just the ASCII alphabetic characters, but any
character in the entire Unicode character set that is considered to be
-alphabetic.
+alphabetic. The backslash sequence column is a (short) synonym for
+the Full-range Unicode form.
(Each of the counterparts has various synonyms as well.
L<perluniprops/Properties accessible through \p{} and \P{}> lists all the
Both the C<\p> forms are unaffected by any locale that is in effect, or whether
the string is in UTF-8 format or not, or whether the platform is EBCDIC or not.
In contrast, the POSIX character classes are affected. If the source string is
-in UTF-8 format, the POSIX classes (with the exception of C<[[:punct:]]>, see
-Note [5] below) behave like their "Full-range" Unicode counterparts. If the
+in UTF-8 format, the POSIX classes behave like their "Full-range"
+Unicode counterparts. If the
source string is not in UTF-8 format, and no locale is in effect, and the
platform is not EBCDIC, all the POSIX classes behave like their ASCII-range
counterparts. Otherwise, they behave based on the rules of the locale or
absent, the classes will match only their ASCII-range counterparts. If you
disagree with this proposal, send email to C<perl5-porters@perl.org>.
- [[:...:]] ASCII-range Full-range backslash Note
- Unicode Unicode sequence
+ [[:...:]] ASCII-range Full-range backslash Note
+ Unicode Unicode sequence
-----------------------------------------------------
- alpha \p{PosixAlpha} \p{Alpha}
- alnum \p{PosixAlnum} \p{Alnum}
+ alpha \p{PosixAlpha} \p{XPosixAlpha}
+ alnum \p{PosixAlnum} \p{XPosixAlnum}
ascii \p{ASCII}
- blank \p{PosixBlank} \p{Blank} = [1]
- \p{HorizSpace} \h [1]
- cntrl \p{PosixCntrl} \p{Cntrl} [2]
- digit \p{PosixDigit} \p{Digit} \d
- graph \p{PosixGraph} \p{Graph} [3]
- lower \p{PosixLower} \p{Lower}
- print \p{PosixPrint} \p{Print} [4]
- punct \p{PosixPunct} \p{Punct} [5]
- \p{PerlSpace} \p{SpacePerl} \s [6]
- space \p{PosixSpace} \p{Space} [6]
- upper \p{PosixUpper} \p{Upper}
- word \p{PerlWord} \p{Word} \w
- xdigit \p{ASCII_Hex_Digit} \p{XDigit}
+ blank \p{PosixBlank} \p{XPosixBlank} \h [1]
+ or \p{HorizSpace} [1]
+ cntrl \p{PosixCntrl} \p{XPosixCntrl} [2]
+ digit \p{PosixDigit} \p{XPosixDigit} \d
+ graph \p{PosixGraph} \p{XPosixGraph} [3]
+ lower \p{PosixLower} \p{XPosixLower}
+ print \p{PosixPrint} \p{XPosixPrint} [4]
+ punct \p{PosixPunct} \p{XPosixPunct} [5]
+ \p{PerlSpace} \p{XPerlSpace} \s [6]
+ space \p{PosixSpace} \p{XPosixSpace} [6]
+ upper \p{PosixUpper} \p{XPosixUpper}
+ word \p{PosixWord} \p{XPosixWord} \w
+ xdigit \p{ASCII_Hex_Digit} \p{XPosixXDigit}
=over 4
C<[-!"#$%&'()*+,./:;<=E<gt>?@[\\\]^_`{|}~]> (although if a locale is in effect,
it could alter the behavior of C<[[:punct:]]>).
-C<\p{Punct}> matches a somewhat different set in the ASCII range, namely
+The similarly named property, C<\p{Punct}>, matches a somewhat different
+set in the ASCII range, namely
C<[-!"#%&'()*,./:;?@[\\\]_{}]>. That is, it is missing C<[$+E<lt>=E<gt>^`|~]>.
This is because Unicode splits what POSIX considers to be punctuation into two
categories, Punctuation and Symbols.
-When the matching string is in UTF-8 format, C<[[:punct:]]> matches what it
-matches in the ASCII range, plus what C<\p{Punct}> matches. This is different
+C<\p{XPosixPunct}> and, when the matching string is in UTF-8 format,
+C<[[:punct:]]> match what C<\p{PosixPunct}> matches in the ASCII range,
+plus what C<\p{Punct}> matches. This is different
than strictly matching according to C<\p{Punct}>. Another way to say it is that
for a UTF-8 string, C<[[:punct:]]> matches all the characters that Unicode
considers to be punctuation, plus all the ASCII-range characters that Unicode
=back
+There are various other synonyms that can be used for these besides
+C<\p{HorizSpace}> and C<\p{XPosixBlank}>. For example
+C<\p{XPosixAlpha}> can be written as C<\p{Alpha}>. All are listed
+in L<perluniprops/Properties accessible through \p{} and \P{}>.
+
=head4 Negation
X<character class, negation>
POSIX ASCII-range Full-range backslash
Unicode Unicode sequence
-----------------------------------------------------
- [[:^digit:]] \P{PosixDigit} \P{Digit} \D
- [[:^space:]] \P{PosixSpace} \P{Space}
- \P{PerlSpace} \P{SpacePerl} \S
- [[:^word:]] \P{PerlWord} \P{Word} \W
+ [[:^digit:]] \P{PosixDigit} \P{XPosixDigit} \D
+ [[:^space:]] \P{PosixSpace} \P{XPosixSpace}
+ \P{PerlSpace} \P{XPerlSpace} \S
+ [[:^word:]] \P{PerlWord} \P{XPosixWord} \W
+
+Again, the backslash sequence means Full-range Unicode.
=head4 [= =] and [. .]
POSIX character classes and their Unicode and Perl equivalents:
- ASCII- Full-
- range range backslash
- POSIX \p{...} \p{} sequence Description
+ ASCII- Full-
+ POSIX range range backslash
+ [[:...:]] \p{...} \p{...} sequence Description
+
-----------------------------------------------------------------------
- alnum PosixAlnum Alnum Alpha plus Digit
- alpha PosixAlpha Alpha Alphabetic characters
- ascii ASCII Any ASCII character
- blank PosixBlank Blank \h Horizontal whitespace;
- full-range also written
- as \p{HorizSpace} (GNU
- extension)
- cntrl PosixCntrl Cntrl Control characters
- digit PosixDigit Digit \d Decimal digits
- graph PosixGraph Graph Alnum plus Punct
- lower PosixLower Lower Lowercase characters
- print PosixPrint Print Graph plus Print, but not
- any Cntrls
- punct PosixPunct Punct These aren't precisely
- equivalent. See NOTE,
- below.
- space PosixSpace Space [\s\cK] Whitespace
- PerlSpace SpacePerl \s Perl's whitespace
- definition
- upper PosixUpper Upper Uppercase characters
- word PerlWord Word \w Alnum plus '_' (Perl
- extension)
- xdigit ASCII_Hex_Digit XDigit Hexadecimal digit,
- ASCII-range is
- [0-9A-Fa-f]
-
-NOTE on C<[[:punct:]]>, C<\p{PosixPunct}> and C<\p{Punct}>:
-In the ASCII range, C<[[:punct:]]> and C<\p{PosixPunct}> match
-C<[-!"#$%&'()*+,./:;<=E<gt>?@[\\\]^_`{|}~]> (although if a locale is in
-effect, it could alter the behavior of C<[[:punct:]]>); and C<\p{Punct}>
-matches C<[-!"#%&'()*,./:;?@[\\\]_{}]>. When matching a UTF-8 string,
-C<[[:punct:]]> matches what it does in the ASCII range, plus what
-C<\p{Punct}> matches. C<\p{Punct}> matches, anything that isn't a
-control, an alphanumeric, a space, nor a symbol.
+ alnum PosixAlnum XPosixAlnum Alpha plus Digit
+ alpha PosixAlpha XPosixAlpha Alphabetic characters
+ ascii ASCII Any ASCII character
+ blank PosixBlank XPosixBlank \h Horizontal whitespace;
+ full-range also
+ written as
+ \p{HorizSpace} (GNU
+ extension)
+ cntrl PosixCntrl XPosixCntrl Control characters
+ digit PosixDigit XPosixDigit \d Decimal digits
+ graph PosixGraph XPosixGraph Alnum plus Punct
+ lower PosixLower XPosixLower Lowercase characters
+ print PosixPrint XPosixPrint Graph plus Blank, but
+ not any Cntrls
+ punct PosixPunct XPosixPunct Punctuation and Symbols
+ in ASCII-range; just
+ punct outside it
+ space PosixSpace XPosixSpace [\s\cK] Whitespace
+ PerlSpace XPerlSpace \s Perl's whitespace def'n
+ upper PosixUpper XPosixUpper Uppercase characters
+ word PerlWord XPosixWord \w Alnum + '_' (Perl
+ extension)
+ xdigit ASCII_Hex_Digit XPosixXDigit Hexadecimal digit,
+ ASCII-range is
+ [0-9A-Fa-f]
+
+There are also various synonyms, such as C<\p{Alpha}> for
+C<\p{XPosixAlpha}>; all are listed in
+L<perluniprops/Properties accessible through \p{} and \P{}>.
Within a character class: