From: Karl Williamson Date: Sat, 30 Oct 2010 16:13:48 +0000 (-0600) Subject: Add consistent synonyms for \p{PosixFOO} X-Git-Tag: upstream/5.16.3~7048 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=d5944336d74c819152158dabfd806d49ad0ecb21;p=platform%2Fupstream%2Fperl.git Add consistent synonyms for \p{PosixFOO} This patch adds a set of synonyms \p{XPosixFOO} for the full extended Unicode version of \p{PosixFOO}, so only one rule need be remembered. Similarly, \p{XPerlSpace} is added to preserve the rule for the one similar class that doesn't have Posix in its name. --- diff --git a/lib/unicore/mktables b/lib/unicore/mktables index c4328091b2..8a5c89a3ce 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -11130,7 +11130,8 @@ sub compile_perl() { # range, with their names prefaced by 'Posix', to signify that these match # what the Posix standard says they should match. A couple are # effectively this, but the name doesn't have 'Posix' in it because there - # just isn't any Posix equivalent. + # just isn't any Posix equivalent. 'XPosix' are the Posix tables extended + # to the full Unicode range, by our guesses as to what is appropriate. # 'Any' is all code points. 
As an error check, instead of just setting it # to be that, construct it to be the union of all the major categories @@ -11195,6 +11196,7 @@ sub compile_perl() { $Lower->set_equivalent_to($gc->table('Lowercase_Letter'), Related => 1); } + $Lower->add_alias('XPosixLower'); $perl->add_match_table("PosixLower", Description => "[a-z]", Initialize => $Lower & $ASCII, @@ -11209,6 +11211,7 @@ sub compile_perl() { $Upper->set_equivalent_to($gc->table('Uppercase_Letter'), Related => 1); } + $Upper->add_alias('XPosixUpper'); $perl->add_match_table("PosixUpper", Description => "[A-Z]", Initialize => $Upper & $ASCII, @@ -11303,6 +11306,7 @@ sub compile_perl() { $Alpha += $gc->table('Nl') if defined $gc->table('Nl'); $Alpha->add_description('Alphabetic'); } + $Alpha->add_alias('XPosixAlpha'); $perl->add_match_table("PosixAlpha", Description => "[A-Za-z]", Initialize => $Alpha & $ASCII, @@ -11312,6 +11316,7 @@ sub compile_perl() { Description => 'Alphabetic and (Decimal) Numeric', Initialize => $Alpha + $gc->table('Decimal_Number'), ); + $Alnum->add_alias('XPosixAlnum'); $perl->add_match_table("PosixAlnum", Description => "[A-Za-z0-9]", Initialize => $Alnum & $ASCII, @@ -11321,14 +11326,16 @@ sub compile_perl() { Description => '\w, including beyond ASCII', Initialize => $Alnum + $gc->table('Mark'), ); + $Word->add_alias('XPosixWord'); my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1 $Word += $Pc if defined $Pc; # This is a Perl extension, so the name doesn't begin with Posix. - $perl->add_match_table('PerlWord', + my $PerlWord = $perl->add_match_table('PerlWord', Description => '\w, restricted to ASCII = [A-Za-z0-9_]', Initialize => $Word & $ASCII, ); + $PerlWord->add_alias('PosixWord'); my $Blank = $perl->add_match_table('Blank', Description => '\h, Horizontal white space', @@ -11341,6 +11348,7 @@ sub compile_perl() { - 0x200B, # ZWSP ); $Blank->add_alias('HorizSpace'); # Another name for it. 
+ $Blank->add_alias('XPosixBlank'); $perl->add_match_table("PosixBlank", Description => "\\t and ' '", Initialize => $Blank & $ASCII, @@ -11362,24 +11370,28 @@ sub compile_perl() { Description => '\s including beyond ASCII plus vertical tab', Initialize => $Blank + $VertSpace, ); + $Space->add_alias('XPosixSpace'); $perl->add_match_table("PosixSpace", Description => "\\t, \\n, \\cK, \\f, \\r, and ' '. (\\cK is vertical tab)", Initialize => $Space & $ASCII, ); # Perl's traditional space doesn't include Vertical Tab - my $SpacePerl = $perl->add_match_table('SpacePerl', + my $XPerlSpace = $perl->add_match_table('XPerlSpace', Description => '\s, including beyond ASCII', Initialize => $Space - 0x000B, ); - $perl->add_match_table('PerlSpace', + $XPerlSpace->add_alias('SpacePerl'); # A pre-existing synonym + my $PerlSpace = $perl->add_match_table('PerlSpace', Description => '\s, restricted to ASCII', - Initialize => $SpacePerl & $ASCII, + Initialize => $XPerlSpace & $ASCII, ); + my $Cntrl = $perl->add_match_table('Cntrl', Description => 'Control characters'); $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1); + $Cntrl->add_alias('XPosixCntrl'); $perl->add_match_table("PosixCntrl", Description => "ASCII control characters: NUL, SOH, STX, ETX, EOT, ENQ, ACK, BEL, BS, HT, LF, VT, FF, CR, SO, SI, DLE, DC1, DC2, DC3, DC4, NAK, SYN, ETB, CAN, EOM, SUB, ESC, FS, GS, RS, US, and DEL", Initialize => $Cntrl & $ASCII, @@ -11396,6 +11408,7 @@ sub compile_perl() { Description => 'Characters that are graphical', Initialize => ~ ($Space + $controls), ); + $Graph->add_alias('XPosixGraph'); $perl->add_match_table("PosixGraph", Description => '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~0-9A-Za-z]', @@ -11406,6 +11419,7 @@ sub compile_perl() { Description => 'Characters that are graphical plus space characters (but no controls)', Initialize => $Blank + $Graph - $gc->table('Control'), ); + $print->add_alias('XPosixPrint'); $perl->add_match_table("PosixPrint", Description => '[- 
0-9A-Za-z!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]', @@ -11416,15 +11430,20 @@ sub compile_perl() { $Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1); # \p{punct} doesn't include the symbols, which posix does + my $XPosixPunct = $perl->add_match_table('XPosixPunct', + Description => '\p{Punct} + ASCII-range \p{Symbol}', + Initialize => $gc->table('Punctuation') + + ($ASCII & $gc->table('Symbol')), + ); $perl->add_match_table('PosixPunct', Description => '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]', - Initialize => $ASCII & ($gc->table('Punctuation') - + $gc->table('Symbol')), + Initialize => $ASCII & $XPosixPunct, ); my $Digit = $perl->add_match_table('Digit', Description => '[0-9] + all other decimal digits'); $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1); + $Digit->add_alias('XPosixDigit'); my $PosixDigit = $perl->add_match_table("PosixDigit", Description => '[0-9]', Initialize => $Digit & $ASCII, @@ -11432,6 +11451,7 @@ sub compile_perl() { # Hex_Digit was not present in first release my $Xdigit = $perl->add_match_table('XDigit'); + $Xdigit->add_alias('XPosixXDigit'); my $Hex = property_ref('Hex_Digit'); if (defined $Hex && ! 
$Hex->is_empty) { $Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1); @@ -11443,6 +11463,10 @@ sub compile_perl() { 0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]); $Xdigit->add_description('[0-9A-Fa-f] and corresponding fullwidth versions, like U+FF10: FULLWIDTH DIGIT ZERO'); } + $perl->add_match_table('PosixXDigit', + Initialize => $ASCII & $Xdigit, + Description => '[0-9A-Fa-f]', + ); my $dt = property_ref('Decomposition_Type'); $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical', diff --git a/pod/perlrecharclass.pod b/pod/perlrecharclass.pod index 0b88cc46a5..7f96b4b5ea 100644 --- a/pod/perlrecharclass.pod +++ b/pod/perlrecharclass.pod @@ -522,7 +522,8 @@ The other counterpart, in the column labelled "Full-range Unicode", matches any appropriate characters in the full Unicode character set. For example, C<\p{Alpha}> will match not just the ASCII alphabetic characters, but any character in the entire Unicode character set that is considered to be -alphabetic. +alphabetic. The backslash sequence column is a (short) synonym for +the Full-range Unicode form. (Each of the counterparts has various synonyms as well. L<perluniprops/Properties accessible through \p{} and \P{}> lists all the @@ -548,25 +549,25 @@ EBCDIC code page is present, they will behave in accordance with those; if absent, the classes will match only their ASCII-range counterparts. If you disagree with this proposal, send email to C<perl5-porters@perl.org>. 
- [[:...:]] ASCII-range Full-range backslash Note - Unicode Unicode sequence + [[:...:]] ASCII-range Full-range backslash Note + Unicode Unicode sequence ----------------------------------------------------- - alpha \p{PosixAlpha} \p{Alpha} - alnum \p{PosixAlnum} \p{Alnum} + alpha \p{PosixAlpha} \p{XPosixAlpha} + alnum \p{PosixAlnum} \p{XPosixAlnum} ascii \p{ASCII} - blank \p{PosixBlank} \p{Blank} = [1] - \p{HorizSpace} \h [1] - cntrl \p{PosixCntrl} \p{Cntrl} [2] - digit \p{PosixDigit} \p{Digit} \d - graph \p{PosixGraph} \p{Graph} [3] - lower \p{PosixLower} \p{Lower} - print \p{PosixPrint} \p{Print} [4] - punct \p{PosixPunct} \p{Punct} [5] - \p{PerlSpace} \p{SpacePerl} \s [6] - space \p{PosixSpace} \p{Space} [6] - upper \p{PosixUpper} \p{Upper} - word \p{PerlWord} \p{Word} \w - xdigit \p{ASCII_Hex_Digit} \p{XDigit} + blank \p{PosixBlank} \p{XPosixBlank} \h [1] + or \p{HorizSpace} [1] + cntrl \p{PosixCntrl} \p{XPosixCntrl} [2] + digit \p{PosixDigit} \p{XPosixDigit} \d + graph \p{PosixGraph} \p{XPosixGraph} [3] + lower \p{PosixLower} \p{XPosixLower} + print \p{PosixPrint} \p{XPosixPrint} [4] + punct \p{PosixPunct} \p{XPosixPunct} [5] + \p{PerlSpace} \p{XPerlSpace} \s [6] + space \p{PosixSpace} \p{XPosixSpace} [6] + upper \p{PosixUpper} \p{XPosixUpper} + word \p{PosixWord} \p{XPosixWord} \w + xdigit \p{ASCII_Hex_Digit} \p{XPosixXDigit} =over 4 @@ -621,6 +622,11 @@ matches the vertical tab, C<\cK>. Same for the two ASCII-only range forms. =back +There are various other synonyms that can be used for these besides +C<\p{HorizSpace}> and C<\p{XPosixBlank}>. For example +C<\p{XPosixAlpha}> can be written as C<\p{Alpha}>. All are listed +in L<perluniprops/Properties accessible through \p{} and \P{}>. 
+ =head4 Negation X @@ -631,10 +637,12 @@ Some examples: POSIX ASCII-range Full-range backslash Unicode Unicode sequence ----------------------------------------------------- - [[:^digit:]] \P{PosixDigit} \P{Digit} \D - [[:^space:]] \P{PosixSpace} \P{Space} - \P{PerlSpace} \P{SpacePerl} \S - [[:^word:]] \P{PerlWord} \P{Word} \W + [[:^digit:]] \P{PosixDigit} \P{XPosixDigit} \D + [[:^space:]] \P{PosixSpace} \P{XPosixSpace} + \P{PerlSpace} \P{XPerlSpace} \S + [[:^word:]] \P{PerlWord} \P{XPosixWord} \W + +Again, the backslash sequence means Full-range Unicode. =head4 [= =] and [. .]