Add consistent synonyms for \p{PosxFOO}

author Karl Williamson <public@khwilliamson.com>

Sat, 30 Oct 2010 16:13:48 +0000 (10:13 -0600)

committer Father Chrysostomos <sprout@cpan.org>

Sun, 31 Oct 2010 19:21:05 +0000 (12:21 -0700)
author Karl Williamson <public@khwilliamson.com>
Sat, 30 Oct 2010 16:13:48 +0000 (10:13 -0600)
committer Father Chrysostomos <sprout@cpan.org>
Sun, 31 Oct 2010 19:21:05 +0000 (12:21 -0700)
diff --git a/lib/unicore/mktables b/lib/unicore/mktables

index c432809..8a5c89a 100644 (file)
--- a/lib/unicore/mktables
+++ b/lib/unicore/mktables
@@ -11130,7 +11130,8 @@ sub compile_perl() {
      # range, with their names prefaced by 'Posix', to signify that these match
      # what the Posix standard says they should match.  A couple are
      # effectively this, but the name doesn't have 'Posix' in it because there
-    # just isn't any Posix equivalent.
+    # just isn't any Posix equivalent.  'XPosix' are the Posix tables extended
+    # to the full Unicode range, by our guesses as to what is appropriate.
  
      # 'Any' is all code points.  As an error check, instead of just setting it
      # to be that, construct it to be the union of all the major categories
@@ -11195,6 +11196,7 @@ sub compile_perl() {
          $Lower->set_equivalent_to($gc->table('Lowercase_Letter'),
                                                                  Related => 1);
      }
+    $Lower->add_alias('XPosixLower');
      $perl->add_match_table("PosixLower",
                              Description => "[a-z]",
                              Initialize => $Lower & $ASCII,
@@ -11209,6 +11211,7 @@ sub compile_perl() {
          $Upper->set_equivalent_to($gc->table('Uppercase_Letter'),
                                                                  Related => 1);
      }
+    $Upper->add_alias('XPosixUpper');
      $perl->add_match_table("PosixUpper",
                              Description => "[A-Z]",
                              Initialize => $Upper & $ASCII,
@@ -11303,6 +11306,7 @@ sub compile_perl() {
          $Alpha += $gc->table('Nl') if defined $gc->table('Nl');
          $Alpha->add_description('Alphabetic');
      }
+    $Alpha->add_alias('XPosixAlpha');
      $perl->add_match_table("PosixAlpha",
                              Description => "[A-Za-z]",
                              Initialize => $Alpha & $ASCII,
@@ -11312,6 +11316,7 @@ sub compile_perl() {
                          Description => 'Alphabetic and (Decimal) Numeric',
                          Initialize => $Alpha + $gc->table('Decimal_Number'),
                          );
+    $Alnum->add_alias('XPosixAlnum');
      $perl->add_match_table("PosixAlnum",
                              Description => "[A-Za-z0-9]",
                              Initialize => $Alnum & $ASCII,
@@ -11321,14 +11326,16 @@ sub compile_perl() {
                                  Description => '\w, including beyond ASCII',
                                  Initialize => $Alnum + $gc->table('Mark'),
                                  );
+    $Word->add_alias('XPosixWord');
      my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
      $Word += $Pc if defined $Pc;
  
      # This is a Perl extension, so the name doesn't begin with Posix.
-    $perl->add_match_table('PerlWord',
+    my $PerlWord = $perl->add_match_table('PerlWord',
                      Description => '\w, restricted to ASCII = [A-Za-z0-9_]',
                      Initialize => $Word & $ASCII,
                      );
+    $PerlWord->add_alias('PosixWord');
  
      my $Blank = $perl->add_match_table('Blank',
                                  Description => '\h, Horizontal white space',
@@ -11341,6 +11348,7 @@ sub compile_perl() {
                                              -   0x200B, # ZWSP
                                  );
      $Blank->add_alias('HorizSpace');        # Another name for it.
+    $Blank->add_alias('XPosixBlank');
      $perl->add_match_table("PosixBlank",
                              Description => "\\t and ' '",
                              Initialize => $Blank & $ASCII,
@@ -11362,24 +11370,28 @@ sub compile_perl() {
                  Description => '\s including beyond ASCII plus vertical tab',
                  Initialize => $Blank + $VertSpace,
      );
+    $Space->add_alias('XPosixSpace');
      $perl->add_match_table("PosixSpace",
                              Description => "\\t, \\n, \\cK, \\f, \\r, and ' '.  (\\cK is vertical tab)",
                              Initialize => $Space & $ASCII,
                              );
  
      # Perl's traditional space doesn't include Vertical Tab
-    my $SpacePerl = $perl->add_match_table('SpacePerl',
+    my $XPerlSpace = $perl->add_match_table('XPerlSpace',
                                    Description => '\s, including beyond ASCII',
                                    Initialize => $Space - 0x000B,
                                  );
-    $perl->add_match_table('PerlSpace',
+    $XPerlSpace->add_alias('SpacePerl');    # A pre-existing synonym
+    my $PerlSpace = $perl->add_match_table('PerlSpace',
                              Description => '\s, restricted to ASCII',
-                            Initialize => $SpacePerl & $ASCII,
+                            Initialize => $XPerlSpace & $ASCII,
                              );
  
+
      my $Cntrl = $perl->add_match_table('Cntrl',
                                          Description => 'Control characters');
      $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1);
+    $Cntrl->add_alias('XPosixCntrl');
      $perl->add_match_table("PosixCntrl",
                              Description => "ASCII control characters: NUL, SOH, STX, ETX, EOT, ENQ, ACK, BEL, BS, HT, LF, VT, FF, CR, SO, SI, DLE, DC1, DC2, DC3, DC4, NAK, SYN, ETB, CAN, EOM, SUB, ESC, FS, GS, RS, US, and DEL",
                              Initialize => $Cntrl & $ASCII,
@@ -11396,6 +11408,7 @@ sub compile_perl() {
                          Description => 'Characters that are graphical',
                          Initialize => ~ ($Space + $controls),
                          );
+    $Graph->add_alias('XPosixGraph');
      $perl->add_match_table("PosixGraph",
                              Description =>
                                  '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~0-9A-Za-z]',
@@ -11406,6 +11419,7 @@ sub compile_perl() {
                          Description => 'Characters that are graphical plus space characters (but no controls)',
                          Initialize => $Blank + $Graph - $gc->table('Control'),
                          );
+    $print->add_alias('XPosixPrint');
      $perl->add_match_table("PosixPrint",
                              Description =>
                                '[- 0-9A-Za-z!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
@@ -11416,15 +11430,20 @@ sub compile_perl() {
      $Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1);
  
      # \p{punct} doesn't include the symbols, which posix does
+    my $XPosixPunct = $perl->add_match_table('XPosixPunct',
+                    Description => '\p{Punct} + ASCII-range \p{Symbol}',
+                    Initialize => $gc->table('Punctuation')
+                                + ($ASCII & $gc->table('Symbol')),
+        );
      $perl->add_match_table('PosixPunct',
          Description => '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
-        Initialize => $ASCII & ($gc->table('Punctuation')
-                                + $gc->table('Symbol')),
+        Initialize => $ASCII & $XPosixPunct,
          );
  
      my $Digit = $perl->add_match_table('Digit',
                              Description => '[0-9] + all other decimal digits');
      $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1);
+    $Digit->add_alias('XPosixDigit');
      my $PosixDigit = $perl->add_match_table("PosixDigit",
                                              Description => '[0-9]',
                                              Initialize => $Digit & $ASCII,
@@ -11432,6 +11451,7 @@ sub compile_perl() {
  
      # Hex_Digit was not present in first release
      my $Xdigit = $perl->add_match_table('XDigit');
+    $Xdigit->add_alias('XPosixXDigit');
      my $Hex = property_ref('Hex_Digit');
      if (defined $Hex && ! $Hex->is_empty) {
          $Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1);
@@ -11443,6 +11463,10 @@ sub compile_perl() {
                                0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]);
          $Xdigit->add_description('[0-9A-Fa-f] and corresponding fullwidth versions, like U+FF10: FULLWIDTH DIGIT ZERO');
      }
+    $perl->add_match_table('PosixXDigit',
+                            Initialize => $ASCII & $Xdigit,
+                            Description => '[0-9A-Fa-f]',
+                        );
  
      my $dt = property_ref('Decomposition_Type');
      $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical',
diff --git a/pod/perlrecharclass.pod b/pod/perlrecharclass.pod

index 0b88cc4..1a6fd31 100644 (file)
--- a/pod/perlrecharclass.pod
+++ b/pod/perlrecharclass.pod
@@ -522,7 +522,8 @@ The other counterpart, in the column labelled "Full-range Unicode", matches any
  appropriate characters in the full Unicode character set.  For example,
  C<\p{Alpha}> will match not just the ASCII alphabetic characters, but any
  character in the entire Unicode character set that is considered to be
-alphabetic.
+alphabetic.  The backslash sequence column is a (short) synonym for
+the Full-range Unicode form.
  
  (Each of the counterparts has various synonyms as well.
  L<perluniprops/Properties accessible through \p{} and \P{}> lists all the
@@ -533,8 +534,8 @@ and any C<\p> property name can be prefixed with "Is" such as C<\p{IsAlpha}>.)
  Both the C<\p> forms are unaffected by any locale that is in effect, or whether
  the string is in UTF-8 format or not, or whether the platform is EBCDIC or not.
  In contrast, the POSIX character classes are affected.  If the source string is
-in UTF-8 format, the POSIX classes (with the exception of C<[[:punct:]]>, see
-Note [5] below) behave like their "Full-range" Unicode counterparts.  If the
+in UTF-8 format, the POSIX classes behave like their "Full-range"
+Unicode counterparts.  If the
  source string is not in UTF-8 format, and no locale is in effect, and the
  platform is not EBCDIC, all the POSIX classes behave like their ASCII-range
  counterparts.  Otherwise, they behave based on the rules of the locale or
@@ -548,25 +549,25 @@ EBCDIC code page is present, they will behave in accordance with those; if
  absent, the classes will match only their ASCII-range counterparts.  If you
  disagree with this proposal, send email to C<perl5-porters@perl.org>.
  
- [[:...:]]      ASCII-range        Full-range  backslash  Note
-                 Unicode            Unicode    sequence
+ [[:...:]]      ASCII-range          Full-range  backslash  Note
+                 Unicode              Unicode     sequence
   -----------------------------------------------------
-   alpha      \p{PosixAlpha}       \p{Alpha}
-   alnum      \p{PosixAlnum}       \p{Alnum}
+   alpha      \p{PosixAlpha}       \p{XPosixAlpha}
+   alnum      \p{PosixAlnum}       \p{XPosixAlnum}
     ascii      \p{ASCII}          
-   blank      \p{PosixBlank}       \p{Blank} =             [1]
-                                   \p{HorizSpace}  \h      [1]
-   cntrl      \p{PosixCntrl}       \p{Cntrl}               [2]
-   digit      \p{PosixDigit}       \p{Digit}       \d
-   graph      \p{PosixGraph}       \p{Graph}               [3]
-   lower      \p{PosixLower}       \p{Lower}
-   print      \p{PosixPrint}       \p{Print}               [4]
-   punct      \p{PosixPunct}       \p{Punct}               [5]
-              \p{PerlSpace}        \p{SpacePerl}   \s      [6]
-   space      \p{PosixSpace}       \p{Space}               [6]
-   upper      \p{PosixUpper}       \p{Upper}
-   word       \p{PerlWord}         \p{Word}        \w
-   xdigit     \p{ASCII_Hex_Digit}  \p{XDigit}
+   blank      \p{PosixBlank}       \p{XPosixBlank}  \h      [1]
+                                   or \p{HorizSpace}        [1]
+   cntrl      \p{PosixCntrl}       \p{XPosixCntrl}          [2]
+   digit      \p{PosixDigit}       \p{XPosixDigit}  \d
+   graph      \p{PosixGraph}       \p{XPosixGraph}          [3]
+   lower      \p{PosixLower}       \p{XPosixLower}
+   print      \p{PosixPrint}       \p{XPosixPrint}          [4]
+   punct      \p{PosixPunct}       \p{XPosixPunct}          [5]
+              \p{PerlSpace}        \p{XPerlSpace}   \s      [6]
+   space      \p{PosixSpace}       \p{XPosixSpace}          [6]
+   upper      \p{PosixUpper}       \p{XPosixUpper}
+   word       \p{PosixWord}        \p{XPosixWord}   \w
+   xdigit     \p{ASCII_Hex_Digit}  \p{XPosixXDigit}
  
  =over 4
  
@@ -602,13 +603,15 @@ non-controls, non-alphanumeric, non-space characters:
  C<[-!"#$%&'()*+,./:;<=E<gt>?@[\\\]^_`{|}~]> (although if a locale is in effect,
  it could alter the behavior of C<[[:punct:]]>).
  
-C<\p{Punct}> matches a somewhat different set in the ASCII range, namely
+The similarly named property, C<\p{Punct}>, matches a somewhat different
+set in the ASCII range, namely
  C<[-!"#%&'()*,./:;?@[\\\]_{}]>.  That is, it is missing C<[$+E<lt>=E<gt>^`|~]>.
  This is because Unicode splits what POSIX considers to be punctuation into two
  categories, Punctuation and Symbols.
  
-When the matching string is in UTF-8 format, C<[[:punct:]]> matches what it
-matches in the ASCII range, plus what C<\p{Punct}> matches.  This is different
+C<\p{PosixPunct>, and when the matching string is in UTF-8 format,
+C<[[:punct:]]>, match what they match in the ASCII range, plus what
+C<\p{Punct}> matches.  This is different
  than strictly matching according to C<\p{Punct}>.  Another way to say it is that
  for a UTF-8 string, C<[[:punct:]]> matches all the characters that Unicode
  considers to be punctuation, plus all the ASCII-range characters that Unicode
@@ -621,6 +624,11 @@ matches the vertical tab, C<\cK>.   Same for the two ASCII-only range forms.
  
  =back
  
+There are various other synonyms that can be used for these besides
+C<\p{HorizSpace}> and \C<\p{XPosixBlank}>.  For example
+C<\p{PosixAlpha}> can be written as C<\p{Alpha}>.  All are listed
+in L<perluniprops/Properties accessible through \p{} and \P{}>.
+
  =head4 Negation
  X<character class, negation>
  
@@ -631,10 +639,12 @@ Some examples:
       POSIX         ASCII-range     Full-range  backslash
                      Unicode         Unicode    sequence
   -----------------------------------------------------
- [[:^digit:]]   \P{PosixDigit}     \P{Digit}      \D
- [[:^space:]]   \P{PosixSpace}     \P{Space}
-                \P{PerlSpace}      \P{SpacePerl}  \S
- [[:^word:]]    \P{PerlWord}       \P{Word}       \W
+ [[:^digit:]]   \P{PosixDigit}  \P{XPosixDigit}   \D
+ [[:^space:]]   \P{PosixSpace}  \P{XPosixSpace}
+                \P{PerlSpace}   \P{XPerlSpace}    \S
+ [[:^word:]]    \P{PerlWord}    \P{XPosixWord}    \W
+
+Again, the backslash sequence means Full-range Unicode.
  
  =head4 [= =] and [. .]
  
diff --git a/pod/perlreref.pod b/pod/perlreref.pod

index 6e028ee..4805c9b 100644 (file)
--- a/pod/perlreref.pod
+++ b/pod/perlreref.pod
@@ -145,44 +145,39 @@ and L<perlunicode> for details.
  
  POSIX character classes and their Unicode and Perl equivalents:
  
-           ASCII-         Full-
-           range          range   backslash
- POSIX    \p{...}         \p{}    sequence       Description
+            ASCII-         Full-
+   POSIX    range          range    backslash
+ [[:...:]]  \p{...}        \p{...}   sequence    Description
+
   -----------------------------------------------------------------------
- alnum   PosixAlnum       Alnum               Alpha plus Digit
- alpha   PosixAlpha       Alpha               Alphabetic characters
- ascii   ASCII                                Any ASCII character
- blank   PosixBlank       Blank     \h        Horizontal whitespace;
-                                                full-range also written
-                                                as \p{HorizSpace} (GNU
-                                                extension)
- cntrl   PosixCntrl       Cntrl               Control characters
- digit   PosixDigit       Digit     \d        Decimal digits
- graph   PosixGraph       Graph               Alnum plus Punct
- lower   PosixLower       Lower               Lowercase characters
- print   PosixPrint       Print               Graph plus Print, but not
-                                                any Cntrls
- punct   PosixPunct       Punct               These aren't precisely
-                                                equivalent.  See NOTE,
-                                                below.
- space   PosixSpace       Space     [\s\cK]   Whitespace
-         PerlSpace        SpacePerl \s        Perl's whitespace
-                                                definition
- upper   PosixUpper       Upper               Uppercase characters
- word    PerlWord         Word      \w        Alnum plus '_' (Perl
-                                                extension)
- xdigit  ASCII_Hex_Digit  XDigit              Hexadecimal digit,
-                                                ASCII-range is
-                                                [0-9A-Fa-f]
-
-NOTE on C<[[:punct:]]>, C<\p{PosixPunct}> and C<\p{Punct}>:
-In the ASCII range, C<[[:punct:]]> and C<\p{PosixPunct}> match
-C<[-!"#$%&'()*+,./:;<=E<gt>?@[\\\]^_`{|}~]> (although if a locale is in
-effect, it could alter the behavior of C<[[:punct:]]>); and C<\p{Punct}>
-matches C<[-!"#%&'()*,./:;?@[\\\]_{}]>.  When matching a UTF-8 string,
-C<[[:punct:]]> matches what it does in the ASCII range, plus what
-C<\p{Punct}> matches.  C<\p{Punct}> matches, anything that isn't a
-control, an alphanumeric, a space, nor a symbol.
+ alnum   PosixAlnum       XPosixAlnum            Alpha plus Digit
+ alpha   PosixAlpha       XPosixAlpha            Alphabetic characters
+ ascii   ASCII                                   Any ASCII character
+ blank   PosixBlank       XPosixBlank   \h       Horizontal whitespace;
+                                                   full-range also
+                                                   written as
+                                                   \p{HorizSpace} (GNU
+                                                   extension)
+ cntrl   PosixCntrl       XPosixCntrl            Control characters
+ digit   PosixDigit       XPosixDigit   \d       Decimal digits
+ graph   PosixGraph       XPosixGraph            Alnum plus Punct
+ lower   PosixLower       XPosixLower            Lowercase characters
+ print   PosixPrint       XPosixPrint            Graph plus Print, but
+                                                   not any Cntrls
+ punct   PosixPunct       XPosixPunct            Punctuation and Symbols
+                                                   in ASCII-range; just
+                                                   punct outside it
+ space   PosixSpace       XPosixSpace   [\s\cK]  Whitespace
+         PerlSpace        XPerlSpace    \s       Perl's whitespace def'n
+ upper   PosixUpper       XPosixUpper            Uppercase characters
+ word    PerlWord         XPosixWord    \w       Alnum + '_' (Perl
+                                                   extension)
+ xdigit  ASCII_Hex_Digit  XPosixDigit            Hexadecimal digit,
+                                                    ASCII-range is
+                                                    [0-9A-Fa-f]
+
+Also, various synonyms like C<\p{Alpha}> for C<\p{XPosixAlpha}>; all listed
+in L<perluniprops/Properties accessible through \p{} and \P{}>
  
  Within a character class:
author	Karl Williamson <public@khwilliamson.com>
	Sat, 30 Oct 2010 16:13:48 +0000 (10:13 -0600)
committer	Father Chrysostomos <sprout@cpan.org>
	Sun, 31 Oct 2010 19:21:05 +0000 (12:21 -0700)
lib/unicore/mktables		patch \| blob \| history
pod/perlrecharclass.pod		patch \| blob \| history
pod/perlreref.pod		patch \| blob \| history