Add consistent synonyms for \p{PosxFOO}
authorKarl Williamson <public@khwilliamson.com>
Sat, 30 Oct 2010 16:13:48 +0000 (10:13 -0600)
committerFather Chrysostomos <sprout@cpan.org>
Sun, 31 Oct 2010 19:21:05 +0000 (12:21 -0700)
This patch adds a set of synonyms \p{XPosixFOO} for the full extended
Unicode version of \p{PosixFOO}, so only one rule need be remembered.
Similarly, \p{XPerlSpace} is added to preserve the rule for the one
similar class that doesn't have Posix in its name.

Prior to this patch there was no exact equivalent to \p{PosixPunct}
extended beyond ASCII.

lib/unicore/mktables
pod/perlrecharclass.pod
pod/perlreref.pod

index c432809..8a5c89a 100644 (file)
@@ -11130,7 +11130,8 @@ sub compile_perl() {
     # range, with their names prefaced by 'Posix', to signify that these match
     # what the Posix standard says they should match.  A couple are
     # effectively this, but the name doesn't have 'Posix' in it because there
-    # just isn't any Posix equivalent.
+    # just isn't any Posix equivalent.  'XPosix' are the Posix tables extended
+    # to the full Unicode range, by our guesses as to what is appropriate.
 
     # 'Any' is all code points.  As an error check, instead of just setting it
     # to be that, construct it to be the union of all the major categories
@@ -11195,6 +11196,7 @@ sub compile_perl() {
         $Lower->set_equivalent_to($gc->table('Lowercase_Letter'),
                                                                 Related => 1);
     }
+    $Lower->add_alias('XPosixLower');
     $perl->add_match_table("PosixLower",
                             Description => "[a-z]",
                             Initialize => $Lower & $ASCII,
@@ -11209,6 +11211,7 @@ sub compile_perl() {
         $Upper->set_equivalent_to($gc->table('Uppercase_Letter'),
                                                                 Related => 1);
     }
+    $Upper->add_alias('XPosixUpper');
     $perl->add_match_table("PosixUpper",
                             Description => "[A-Z]",
                             Initialize => $Upper & $ASCII,
@@ -11303,6 +11306,7 @@ sub compile_perl() {
         $Alpha += $gc->table('Nl') if defined $gc->table('Nl');
         $Alpha->add_description('Alphabetic');
     }
+    $Alpha->add_alias('XPosixAlpha');
     $perl->add_match_table("PosixAlpha",
                             Description => "[A-Za-z]",
                             Initialize => $Alpha & $ASCII,
@@ -11312,6 +11316,7 @@ sub compile_perl() {
                         Description => 'Alphabetic and (Decimal) Numeric',
                         Initialize => $Alpha + $gc->table('Decimal_Number'),
                         );
+    $Alnum->add_alias('XPosixAlnum');
     $perl->add_match_table("PosixAlnum",
                             Description => "[A-Za-z0-9]",
                             Initialize => $Alnum & $ASCII,
@@ -11321,14 +11326,16 @@ sub compile_perl() {
                                 Description => '\w, including beyond ASCII',
                                 Initialize => $Alnum + $gc->table('Mark'),
                                 );
+    $Word->add_alias('XPosixWord');
     my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
     $Word += $Pc if defined $Pc;
 
     # This is a Perl extension, so the name doesn't begin with Posix.
-    $perl->add_match_table('PerlWord',
+    my $PerlWord = $perl->add_match_table('PerlWord',
                     Description => '\w, restricted to ASCII = [A-Za-z0-9_]',
                     Initialize => $Word & $ASCII,
                     );
+    $PerlWord->add_alias('PosixWord');
 
     my $Blank = $perl->add_match_table('Blank',
                                 Description => '\h, Horizontal white space',
@@ -11341,6 +11348,7 @@ sub compile_perl() {
                                             -   0x200B, # ZWSP
                                 );
     $Blank->add_alias('HorizSpace');        # Another name for it.
+    $Blank->add_alias('XPosixBlank');
     $perl->add_match_table("PosixBlank",
                             Description => "\\t and ' '",
                             Initialize => $Blank & $ASCII,
@@ -11362,24 +11370,28 @@ sub compile_perl() {
                 Description => '\s including beyond ASCII plus vertical tab',
                 Initialize => $Blank + $VertSpace,
     );
+    $Space->add_alias('XPosixSpace');
     $perl->add_match_table("PosixSpace",
                             Description => "\\t, \\n, \\cK, \\f, \\r, and ' '.  (\\cK is vertical tab)",
                             Initialize => $Space & $ASCII,
                             );
 
     # Perl's traditional space doesn't include Vertical Tab
-    my $SpacePerl = $perl->add_match_table('SpacePerl',
+    my $XPerlSpace = $perl->add_match_table('XPerlSpace',
                                   Description => '\s, including beyond ASCII',
                                   Initialize => $Space - 0x000B,
                                 );
-    $perl->add_match_table('PerlSpace',
+    $XPerlSpace->add_alias('SpacePerl');    # A pre-existing synonym
+    my $PerlSpace = $perl->add_match_table('PerlSpace',
                             Description => '\s, restricted to ASCII',
-                            Initialize => $SpacePerl & $ASCII,
+                            Initialize => $XPerlSpace & $ASCII,
                             );
 
+
     my $Cntrl = $perl->add_match_table('Cntrl',
                                         Description => 'Control characters');
     $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1);
+    $Cntrl->add_alias('XPosixCntrl');
     $perl->add_match_table("PosixCntrl",
                             Description => "ASCII control characters: NUL, SOH, STX, ETX, EOT, ENQ, ACK, BEL, BS, HT, LF, VT, FF, CR, SO, SI, DLE, DC1, DC2, DC3, DC4, NAK, SYN, ETB, CAN, EOM, SUB, ESC, FS, GS, RS, US, and DEL",
                             Initialize => $Cntrl & $ASCII,
@@ -11396,6 +11408,7 @@ sub compile_perl() {
                         Description => 'Characters that are graphical',
                         Initialize => ~ ($Space + $controls),
                         );
+    $Graph->add_alias('XPosixGraph');
     $perl->add_match_table("PosixGraph",
                             Description =>
                                 '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~0-9A-Za-z]',
@@ -11406,6 +11419,7 @@ sub compile_perl() {
                         Description => 'Characters that are graphical plus space characters (but no controls)',
                         Initialize => $Blank + $Graph - $gc->table('Control'),
                         );
+    $print->add_alias('XPosixPrint');
     $perl->add_match_table("PosixPrint",
                             Description =>
                               '[- 0-9A-Za-z!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
@@ -11416,15 +11430,20 @@ sub compile_perl() {
     $Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1);
 
     # \p{punct} doesn't include the symbols, which posix does
+    my $XPosixPunct = $perl->add_match_table('XPosixPunct',
+                    Description => '\p{Punct} + ASCII-range \p{Symbol}',
+                    Initialize => $gc->table('Punctuation')
+                                + ($ASCII & $gc->table('Symbol')),
+        );
     $perl->add_match_table('PosixPunct',
         Description => '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
-        Initialize => $ASCII & ($gc->table('Punctuation')
-                                + $gc->table('Symbol')),
+        Initialize => $ASCII & $XPosixPunct,
         );
 
     my $Digit = $perl->add_match_table('Digit',
                             Description => '[0-9] + all other decimal digits');
     $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1);
+    $Digit->add_alias('XPosixDigit');
     my $PosixDigit = $perl->add_match_table("PosixDigit",
                                             Description => '[0-9]',
                                             Initialize => $Digit & $ASCII,
@@ -11432,6 +11451,7 @@ sub compile_perl() {
 
     # Hex_Digit was not present in first release
     my $Xdigit = $perl->add_match_table('XDigit');
+    $Xdigit->add_alias('XPosixXDigit');
     my $Hex = property_ref('Hex_Digit');
     if (defined $Hex && ! $Hex->is_empty) {
         $Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1);
@@ -11443,6 +11463,10 @@ sub compile_perl() {
                               0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]);
         $Xdigit->add_description('[0-9A-Fa-f] and corresponding fullwidth versions, like U+FF10: FULLWIDTH DIGIT ZERO');
     }
+    $perl->add_match_table('PosixXDigit',
+                            Initialize => $ASCII & $Xdigit,
+                            Description => '[0-9A-Fa-f]',
+                        );
 
     my $dt = property_ref('Decomposition_Type');
     $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical',
index 0b88cc4..1a6fd31 100644 (file)
@@ -522,7 +522,8 @@ The other counterpart, in the column labelled "Full-range Unicode", matches any
 appropriate characters in the full Unicode character set.  For example,
 C<\p{Alpha}> will match not just the ASCII alphabetic characters, but any
 character in the entire Unicode character set that is considered to be
-alphabetic.
+alphabetic.  The backslash sequence column is a (short) synonym for
+the Full-range Unicode form.
 
 (Each of the counterparts has various synonyms as well.
 L<perluniprops/Properties accessible through \p{} and \P{}> lists all the
@@ -533,8 +534,8 @@ and any C<\p> property name can be prefixed with "Is" such as C<\p{IsAlpha}>.)
 Both the C<\p> forms are unaffected by any locale that is in effect, or whether
 the string is in UTF-8 format or not, or whether the platform is EBCDIC or not.
 In contrast, the POSIX character classes are affected.  If the source string is
-in UTF-8 format, the POSIX classes (with the exception of C<[[:punct:]]>, see
-Note [5] below) behave like their "Full-range" Unicode counterparts.  If the
+in UTF-8 format, the POSIX classes behave like their "Full-range"
+Unicode counterparts.  If the
 source string is not in UTF-8 format, and no locale is in effect, and the
 platform is not EBCDIC, all the POSIX classes behave like their ASCII-range
 counterparts.  Otherwise, they behave based on the rules of the locale or
@@ -548,25 +549,25 @@ EBCDIC code page is present, they will behave in accordance with those; if
 absent, the classes will match only their ASCII-range counterparts.  If you
 disagree with this proposal, send email to C<perl5-porters@perl.org>.
 
- [[:...:]]      ASCII-range        Full-range  backslash  Note
-                 Unicode            Unicode    sequence
+ [[:...:]]      ASCII-range          Full-range  backslash  Note
+                 Unicode              Unicode     sequence
  -----------------------------------------------------
-   alpha      \p{PosixAlpha}       \p{Alpha}
-   alnum      \p{PosixAlnum}       \p{Alnum}
+   alpha      \p{PosixAlpha}       \p{XPosixAlpha}
+   alnum      \p{PosixAlnum}       \p{XPosixAlnum}
    ascii      \p{ASCII}          
-   blank      \p{PosixBlank}       \p{Blank} =             [1]
-                                   \p{HorizSpace}  \h      [1]
-   cntrl      \p{PosixCntrl}       \p{Cntrl}               [2]
-   digit      \p{PosixDigit}       \p{Digit}       \d
-   graph      \p{PosixGraph}       \p{Graph}               [3]
-   lower      \p{PosixLower}       \p{Lower}
-   print      \p{PosixPrint}       \p{Print}               [4]
-   punct      \p{PosixPunct}       \p{Punct}               [5]
-              \p{PerlSpace}        \p{SpacePerl}   \s      [6]
-   space      \p{PosixSpace}       \p{Space}               [6]
-   upper      \p{PosixUpper}       \p{Upper}
-   word       \p{PerlWord}         \p{Word}        \w
-   xdigit     \p{ASCII_Hex_Digit}  \p{XDigit}
+   blank      \p{PosixBlank}       \p{XPosixBlank}  \h      [1]
+                                   or \p{HorizSpace}        [1]
+   cntrl      \p{PosixCntrl}       \p{XPosixCntrl}          [2]
+   digit      \p{PosixDigit}       \p{XPosixDigit}  \d
+   graph      \p{PosixGraph}       \p{XPosixGraph}          [3]
+   lower      \p{PosixLower}       \p{XPosixLower}
+   print      \p{PosixPrint}       \p{XPosixPrint}          [4]
+   punct      \p{PosixPunct}       \p{XPosixPunct}          [5]
+              \p{PerlSpace}        \p{XPerlSpace}   \s      [6]
+   space      \p{PosixSpace}       \p{XPosixSpace}          [6]
+   upper      \p{PosixUpper}       \p{XPosixUpper}
+   word       \p{PosixWord}        \p{XPosixWord}   \w
+   xdigit     \p{ASCII_Hex_Digit}  \p{XPosixXDigit}
 
 =over 4
 
@@ -602,13 +603,15 @@ non-controls, non-alphanumeric, non-space characters:
 C<[-!"#$%&'()*+,./:;<=E<gt>?@[\\\]^_`{|}~]> (although if a locale is in effect,
 it could alter the behavior of C<[[:punct:]]>).
 
-C<\p{Punct}> matches a somewhat different set in the ASCII range, namely
+The similarly named property, C<\p{Punct}>, matches a somewhat different
+set in the ASCII range, namely
 C<[-!"#%&'()*,./:;?@[\\\]_{}]>.  That is, it is missing C<[$+E<lt>=E<gt>^`|~]>.
 This is because Unicode splits what POSIX considers to be punctuation into two
 categories, Punctuation and Symbols.
 
-When the matching string is in UTF-8 format, C<[[:punct:]]> matches what it
-matches in the ASCII range, plus what C<\p{Punct}> matches.  This is different
+C<\p{PosixPunct>, and when the matching string is in UTF-8 format,
+C<[[:punct:]]>, match what they match in the ASCII range, plus what
+C<\p{Punct}> matches.  This is different
 than strictly matching according to C<\p{Punct}>.  Another way to say it is that
 for a UTF-8 string, C<[[:punct:]]> matches all the characters that Unicode
 considers to be punctuation, plus all the ASCII-range characters that Unicode
@@ -621,6 +624,11 @@ matches the vertical tab, C<\cK>.   Same for the two ASCII-only range forms.
 
 =back
 
+There are various other synonyms that can be used for these besides
+C<\p{HorizSpace}> and \C<\p{XPosixBlank}>.  For example
+C<\p{PosixAlpha}> can be written as C<\p{Alpha}>.  All are listed
+in L<perluniprops/Properties accessible through \p{} and \P{}>.
+
 =head4 Negation
 X<character class, negation>
 
@@ -631,10 +639,12 @@ Some examples:
      POSIX         ASCII-range     Full-range  backslash
                     Unicode         Unicode    sequence
  -----------------------------------------------------
- [[:^digit:]]   \P{PosixDigit}     \P{Digit}      \D
- [[:^space:]]   \P{PosixSpace}     \P{Space}
-                \P{PerlSpace}      \P{SpacePerl}  \S
- [[:^word:]]    \P{PerlWord}       \P{Word}       \W
+ [[:^digit:]]   \P{PosixDigit}  \P{XPosixDigit}   \D
+ [[:^space:]]   \P{PosixSpace}  \P{XPosixSpace}
+                \P{PerlSpace}   \P{XPerlSpace}    \S
+ [[:^word:]]    \P{PerlWord}    \P{XPosixWord}    \W
+
+Again, the backslash sequence means Full-range Unicode.
 
 =head4 [= =] and [. .]
 
index 6e028ee..4805c9b 100644 (file)
@@ -145,44 +145,39 @@ and L<perlunicode> for details.
 
 POSIX character classes and their Unicode and Perl equivalents:
 
-           ASCII-         Full-
-           range          range   backslash
- POSIX    \p{...}         \p{}    sequence       Description
+            ASCII-         Full-
+   POSIX    range          range    backslash
+ [[:...:]]  \p{...}        \p{...}   sequence    Description
+
  -----------------------------------------------------------------------
- alnum   PosixAlnum       Alnum               Alpha plus Digit
- alpha   PosixAlpha       Alpha               Alphabetic characters
- ascii   ASCII                                Any ASCII character
- blank   PosixBlank       Blank     \h        Horizontal whitespace;
-                                                full-range also written
-                                                as \p{HorizSpace} (GNU
-                                                extension)
- cntrl   PosixCntrl       Cntrl               Control characters
- digit   PosixDigit       Digit     \d        Decimal digits
- graph   PosixGraph       Graph               Alnum plus Punct
- lower   PosixLower       Lower               Lowercase characters
- print   PosixPrint       Print               Graph plus Print, but not
-                                                any Cntrls
- punct   PosixPunct       Punct               These aren't precisely
-                                                equivalent.  See NOTE,
-                                                below.
- space   PosixSpace       Space     [\s\cK]   Whitespace
-         PerlSpace        SpacePerl \s        Perl's whitespace
-                                                definition
- upper   PosixUpper       Upper               Uppercase characters
- word    PerlWord         Word      \w        Alnum plus '_' (Perl
-                                                extension)
- xdigit  ASCII_Hex_Digit  XDigit              Hexadecimal digit,
-                                                ASCII-range is
-                                                [0-9A-Fa-f]
-
-NOTE on C<[[:punct:]]>, C<\p{PosixPunct}> and C<\p{Punct}>:
-In the ASCII range, C<[[:punct:]]> and C<\p{PosixPunct}> match
-C<[-!"#$%&'()*+,./:;<=E<gt>?@[\\\]^_`{|}~]> (although if a locale is in
-effect, it could alter the behavior of C<[[:punct:]]>); and C<\p{Punct}>
-matches C<[-!"#%&'()*,./:;?@[\\\]_{}]>.  When matching a UTF-8 string,
-C<[[:punct:]]> matches what it does in the ASCII range, plus what
-C<\p{Punct}> matches.  C<\p{Punct}> matches, anything that isn't a
-control, an alphanumeric, a space, nor a symbol.
+ alnum   PosixAlnum       XPosixAlnum            Alpha plus Digit
+ alpha   PosixAlpha       XPosixAlpha            Alphabetic characters
+ ascii   ASCII                                   Any ASCII character
+ blank   PosixBlank       XPosixBlank   \h       Horizontal whitespace;
+                                                   full-range also
+                                                   written as
+                                                   \p{HorizSpace} (GNU
+                                                   extension)
+ cntrl   PosixCntrl       XPosixCntrl            Control characters
+ digit   PosixDigit       XPosixDigit   \d       Decimal digits
+ graph   PosixGraph       XPosixGraph            Alnum plus Punct
+ lower   PosixLower       XPosixLower            Lowercase characters
+ print   PosixPrint       XPosixPrint            Graph plus Print, but
+                                                   not any Cntrls
+ punct   PosixPunct       XPosixPunct            Punctuation and Symbols
+                                                   in ASCII-range; just
+                                                   punct outside it
+ space   PosixSpace       XPosixSpace   [\s\cK]  Whitespace
+         PerlSpace        XPerlSpace    \s       Perl's whitespace def'n
+ upper   PosixUpper       XPosixUpper            Uppercase characters
+ word    PerlWord         XPosixWord    \w       Alnum + '_' (Perl
+                                                   extension)
+ xdigit  ASCII_Hex_Digit  XPosixDigit            Hexadecimal digit,
+                                                    ASCII-range is
+                                                    [0-9A-Fa-f]
+
+Also, various synonyms like C<\p{Alpha}> for C<\p{XPosixAlpha}>; all listed
+in L<perluniprops/Properties accessible through \p{} and \P{}>
 
 Within a character class: