Unicode categories continue:

author Jarkko Hietaniemi <jhi@iki.fi>

Fri, 19 Oct 2001 03:25:44 +0000 (03:25 +0000)

committer Jarkko Hietaniemi <jhi@iki.fi>

Fri, 19 Oct 2001 03:25:44 +0000 (03:25 +0000)
author Jarkko Hietaniemi <jhi@iki.fi>
Fri, 19 Oct 2001 03:25:44 +0000 (03:25 +0000)
committer Jarkko Hietaniemi <jhi@iki.fi>
Fri, 19 Oct 2001 03:25:44 +0000 (03:25 +0000)
diff --git a/lib/unicore/Blocks.pl b/lib/unicore/Blocks.pl

index b58ff0d..c1b2617 100644 (file)
--- a/lib/unicore/Blocks.pl
+++ b/lib/unicore/Blocks.pl
@@ -90,7 +90,7 @@ FE70  FEFE    Arabic Presentation Forms-B
  FEFF           Specials
  FF00   FFEF    Halfwidth and Fullwidth Forms
  FFF0   FFFD    Specials
-10300  1032F   Old Italic
+10300  1032F   Old Italic Block
  10330  1034F   Gothic Block
  10400  1044F   Deseret Block
  1D000  1D0FF   Byzantine Musical Symbols
diff --git a/lib/unicore/In.pl b/lib/unicore/In.pl

index 8d59516..8e3cdf5 100644 (file)
--- a/lib/unicore/In.pl
+++ b/lib/unicore/In.pl
@@ -133,7 +133,7 @@
  'Number Forms'                                => '95',
  'OGHAM'                                       => '38',
  'Ogham Block'                                 => '84',
-'Old Italic'                                  => '137',
+'Old Italic Block'                            => '137',
  'OLD-ITALIC'                                  => '47',
  'Optical Character Recognition'               => '100',
  'ORIYA'                                       => '23',
@@ -405,7 +405,7 @@
         'Ogham(?:[-_]|\s+)?Block' => '84',
  },
  'ol' => {
-       'Old(?:[-_]|\s+)?Italic' => '137',
+       'Old(?:[-_]|\s+)?Italic(?:[-_]|\s+)?Block' => '137',
         'OLD(?:[-_]|\s+)?ITALIC' => '47',
  },
  'op' => {
@@ -489,3 +489,146 @@
         'Yi(?:[-_]|\s+)?Syllables' => '122',
  },
  );
+
+%utf8::InScript =
+(
+  10 => 'LATIN',
+  11 => 'GREEK',
+  12 => 'INHERITED',
+  13 => 'CYRILLIC',
+  14 => 'ARMENIAN',
+  15 => 'HEBREW',
+  16 => 'ARABIC',
+  17 => 'SYRIAC',
+  18 => 'THAANA',
+  19 => 'DEVANAGARI',
+  20 => 'BENGALI',
+  21 => 'GURMUKHI',
+  22 => 'GUJARATI',
+  23 => 'ORIYA',
+  24 => 'TAMIL',
+  25 => 'TELUGU',
+  26 => 'KANNADA',
+  27 => 'MALAYALAM',
+  28 => 'SINHALA',
+  29 => 'THAI',
+  30 => 'LAO',
+  31 => 'TIBETAN',
+  32 => 'MYANMAR',
+  33 => 'GEORGIAN',
+  34 => 'HANGUL',
+  35 => 'ETHIOPIC',
+  36 => 'CHEROKEE',
+  37 => 'CANADIAN-ABORIGINAL',
+  38 => 'OGHAM',
+  39 => 'RUNIC',
+  40 => 'KHMER',
+  41 => 'MONGOLIAN',
+  42 => 'HAN',
+  43 => 'HIRAGANA',
+  44 => 'KATAKANA',
+  45 => 'BOPOMOFO',
+  46 => 'YI',
+  47 => 'OLD-ITALIC',
+  48 => 'GOTHIC',
+  49 => 'DESERET',
+);
+
+%utf8::InBlock =
+(
+  51 => 'Basic Latin',
+  52 => 'Latin-1 Supplement',
+  53 => 'Latin Extended-A',
+  54 => 'Latin Extended-B',
+  55 => 'IPA Extensions',
+  56 => 'Spacing Modifier Letters',
+  57 => 'Combining Diacritical Marks',
+  58 => 'Greek',
+  59 => 'Cyrillic',
+  60 => 'Armenian',
+  61 => 'Hebrew',
+  62 => 'Arabic',
+  63 => 'Syriac',
+  64 => 'Thaana',
+  65 => 'Devanagari',
+  66 => 'Bengali',
+  67 => 'Gurmukhi',
+  68 => 'Gujarati',
+  69 => 'Oriya',
+  70 => 'Tamil',
+  71 => 'Telugu',
+  72 => 'Kannada',
+  73 => 'Malayalam',
+  74 => 'Sinhala',
+  75 => 'Thai',
+  76 => 'Lao',
+  77 => 'Tibetan',
+  78 => 'Myanmar',
+  79 => 'Georgian',
+  80 => 'Hangul Jamo',
+  81 => 'Ethiopic',
+  82 => 'Cherokee',
+  83 => 'Unified Canadian Aboriginal Syllabics',
+  84 => 'Ogham',
+  85 => 'Runic',
+  86 => 'Khmer',
+  87 => 'Mongolian',
+  88 => 'Latin Extended Additional',
+  89 => 'Greek Extended',
+  90 => 'General Punctuation',
+  91 => 'Superscripts and Subscripts',
+  92 => 'Currency Symbols',
+  93 => 'Combining Marks for Symbols',
+  94 => 'Letterlike Symbols',
+  95 => 'Number Forms',
+  96 => 'Arrows',
+  97 => 'Mathematical Operators',
+  98 => 'Miscellaneous Technical',
+  99 => 'Control Pictures',
+ 100 => 'Optical Character Recognition',
+ 101 => 'Enclosed Alphanumerics',
+ 102 => 'Box Drawing',
+ 103 => 'Block Elements',
+ 104 => 'Geometric Shapes',
+ 105 => 'Miscellaneous Symbols',
+ 106 => 'Dingbats',
+ 107 => 'Braille Patterns',
+ 108 => 'CJK Radicals Supplement',
+ 109 => 'Kangxi Radicals',
+ 110 => 'Ideographic Description Characters',
+ 111 => 'CJK Symbols and Punctuation',
+ 112 => 'Hiragana',
+ 113 => 'Katakana',
+ 114 => 'Bopomofo',
+ 115 => 'Hangul Compatibility Jamo',
+ 116 => 'Kanbun',
+ 117 => 'Bopomofo Extended',
+ 118 => 'Enclosed CJK Letters and Months',
+ 119 => 'CJK Compatibility',
+ 120 => 'CJK Unified Ideographs Extension A',
+ 121 => 'CJK Unified Ideographs',
+ 122 => 'Yi Syllables',
+ 123 => 'Yi Radicals',
+ 124 => 'Hangul Syllables',
+ 125 => 'High Surrogates',
+ 126 => 'High Private Use Surrogates',
+ 127 => 'Low Surrogates',
+ 128 => 'CJK Compatibility Ideographs',
+ 129 => 'Alphabetic Presentation Forms',
+ 130 => 'Arabic Presentation Forms-A',
+ 131 => 'Combining Half Marks',
+ 132 => 'CJK Compatibility Forms',
+ 133 => 'Small Form Variants',
+ 134 => 'Arabic Presentation Forms-B',
+ 135 => 'Specials',
+ 136 => 'Halfwidth and Fullwidth Forms',
+ 137 => 'Old Italic',
+ 138 => 'Gothic',
+ 139 => 'Deseret',
+ 140 => 'Byzantine Musical Symbols',
+ 141 => 'Musical Symbols',
+ 142 => 'Mathematical Alphanumeric Symbols',
+ 143 => 'CJK Unified Ideographs Extension B',
+ 144 => 'CJK Compatibility Ideographs Supplement',
+ 145 => 'Tags',
+);
diff --git a/lib/unicore/In/137.pl b/lib/unicore/In/137.pl

index 7161573..6be2e0d 100644 (file)
--- a/lib/unicore/In/137.pl
+++ b/lib/unicore/In/137.pl
@@ -2,5 +2,5 @@
  # This file is built by mktables from e.g. Unicode.txt.
  # Any changes made here will be lost!
  return <<'END';
-10300  1032F   Old Italic
+10300  1032F   Old Italic Block
  END
diff --git a/lib/unicore/mktables b/lib/unicore/mktables

index 060a0e6..3328f69 100644 (file)
--- a/lib/unicore/mktables
+++ b/lib/unicore/mktables
@@ -103,6 +103,9 @@ my %In;
  my $InId = 0;
  my %InIn;
  
+my %InScript;
+my %InBlock;
+
  #
  # Read in the Unicode.txt, the main Unicode database.
  #
@@ -355,8 +358,9 @@ for my $script (sort { $a->[0] <=> $b->[0] } @Scripts) {
         extend($Script{$name}, $last);
      }
      unless (defined $In{$name}) {
-       $In{$name}   = $InId++;
-       $InIn{$name} = $Script{$name};
+       $InScript{$InId} = $name;
+       $In{$name}       = $InId++;
+       $InIn{$name}     = $Script{$name};
      }
  }
  
@@ -382,11 +386,19 @@ if (open(my $Blocks, "Blocks.txt")) {
         next unless /^([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+)\s*;\s*(.+?)\s*$/;
         
         my ($first, $last, $name) = ($1, $2, $3);
+       my $origname = $name;
  
         # If there's a naming conflict (the script names are
         # in uppercase), the name of the block has " Block"
         # appended to it.
-       $name = "$name Block" if defined $In{"\U$name"};
+       my $pat = $name;
+       $pat =~ s/([- _])/(?:[-_]|\\s+)?/g;
+       for my $i (values %InScript) {
+           if ($i =~ /^$pat$/i) {
+               $name .= " Block";
+               last;
+           }
+       }
  
         append(\@Blocks,              $first, $name);
         append($Blocks{$name} ||= [], $first, $name);
@@ -395,8 +407,9 @@ if (open(my $Blocks, "Blocks.txt")) {
             extend($Blocks{$name}, $last);
         }
         unless (defined $In{$name}) {
-           $In{$name}   = $InId++;
-           $InIn{$name} = $Blocks{$name};
+           $InBlock{$InId} = $origname;
+           $In{$name}      = $InId++;
+           $InIn{$name}    = $Blocks{$name};
         }
      }
  } else {
@@ -591,6 +604,39 @@ EOT
  
  mapping(\%In, "In");
  
+#
+# Append the InScript and InBlock mappings.
+# These are needed only if Script= and Block= syntaxes are used.
+#
+
+if (open(my $In, ">>In.pl")) {
+    print $In <<EOT;
+
+%utf8::InScript =
+(
+EOT
+    for my $i (sort { $a <=> $b } keys %InScript) {
+       printf $In "%4d => '$InScript{$i}',\n", $i;
+    }
+    print $In <<EOT;
+);
+EOT
+
+    print $In <<EOT;
+
+%utf8::InBlock =
+(
+EOT
+    for my $i (sort { $a <=> $b } keys %InBlock) {
+       printf $In "%4d => '$InBlock{$i}',\n", $i;
+    }
+    print $In <<EOT;
+);
+EOT
+} else {
+    die "$0: In.pl: $!\n";
+}
+
  # Easy low-calorie cheat.
  use File::Copy;
  copy("In/$In{Noncharacter_Code_Point}.pl", "Is/Cn.pl");
diff --git a/lib/utf8_heavy.pl b/lib/utf8_heavy.pl

index fe286d9..0cc71f4 100644 (file)
--- a/lib/utf8_heavy.pl
+++ b/lib/utf8_heavy.pl
@@ -27,7 +27,7 @@ sub SWASHNEW {
  
         unless (defined $file) {
             defined %utf8::Is || do "unicore/Is.pl";
-           if ($type =~ /^(?:Is)?[- _]?([A-Z].*)$/i) {
+           if ($type =~ /^(?:Is|Category\s*=\s*)?[- _]?([A-Z].*)$/i) {
                 my $istype = $1;
                 print "istype = $istype\n" if DEBUG;
                 unless ($list = do "unicore/Is/$istype.pl") {
@@ -55,10 +55,11 @@ sub SWASHNEW {
             unless (defined $file) {
                 defined %utf8::In || do "unicore/In.pl";
                 $type = 'Lampersand' if $type =~ /^(?:Is)?L&$/;
-               if ($type =~ /^(?:In)?[- _]?(?!herited$)(.+)/i) {
-                   my $intype = $1;
-                   print "intype = $intype\n" if DEBUG;
-                   if (exists $utf8::Is{$istype}) {
+               if ($type =~ /^(In|(?:Script|Block)\s*=\s*)?[- _]?(?!herited$)(.+)/i) {
+                   my $incat  = $1;
+                   my $intype = $2;
+                   print "incat = $incat, intype = $intype\n" if DEBUG;
+                   if (exists $utf8::In{$intype}) {
                         $file = "unicore/In/$utf8::In{$intype}";
                     } else {
                         my $inprefix = substr(lc($intype), 0, 2);
@@ -69,7 +70,13 @@ sub SWASHNEW {
                             for my $k (keys %{$utf8::InPat{$inprefix}}) {
                                 print "inprefix = $inprefix, In = $In, k = $k\n" if DEBUG;
                                 if ($In =~ /^$k$/i) {
-                                   $file = "unicore/In/$utf8::InPat{$inprefix}->{$k}";
+                                   my $i = $utf8::InPat{$inprefix}->{$k};
+                                   print "inprefix = $inprefix, In = $In, k = $k, i = $i\n" if DEBUG;
+                                   next if $incat =~ /^S/ &&
+                                           !exists $utf8::InScript{$i};
+                                   next if $incat =~ /^B/ &&
+                                           !exists $utf8::InBlock{$i};
+                                   $file = "unicore/In/$i";
                                     print "inprefix = $inprefix, In = $In, k = $k, file = $file\n" if DEBUG;
                                     last;
                                 }
diff --git a/pod/perltodo.pod b/pod/perltodo.pod

index b903593..5fae97a 100644 (file)
--- a/pod/perltodo.pod
+++ b/pod/perltodo.pod
@@ -77,12 +77,8 @@ Allow for the metaproperties: C<XID Start>, C<XID Continue>,
  C<NF*_NO>, C<NF*_MAYBE> (require the DerivedCoreProperties and
  DerivedNormalizationProperties files).
  
-There are also enumerated properties: C<Decomposition Type>,
-C<Numeric Type>, C<East Asian Width>, C<Line Break>.  These
-properties have multiple values: for uniqueness the property
-value should be appended.  For example, C<\p{IsAlphabetic}>
-wouldbe the binary property, while C<\p{AlphabeticLineBreak}>
-would mean the enumerated property.
+There are also multiple value properties still unimplemented:
+C<Numeric Type>, C<East Asian Width>.
  
  =item *
  
diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod

index 9e3ca75..6bd0423 100644 (file)
--- a/pod/perlunicode.pod
+++ b/pod/perlunicode.pod
@@ -311,7 +311,7 @@ The scripts available for C<\p{In...}> and C<\P{In...}>, for example
      Hangul
      Ethiopic
      Cherokee
-    CanadianAboriginal
+    Canadian Aboriginal
      Ogham
      Runic
      Khmer
@@ -321,7 +321,7 @@ The scripts available for C<\p{In...}> and C<\P{In...}>, for example
      Bopomofo
      Han
      Yi
-    OldItalic
+    Old Italic
      Gothic
      Deseret
      Inherited
@@ -386,101 +386,101 @@ preferential Unicode character class definition; this meant that
  the definitions of some character classes changed (the ones in the
  below list that have the C<Block> appended).
  
-   BasicLatin
-   Latin1Supplement
-   LatinExtendedA
-   LatinExtendedB
-   IPAExtensions
-   SpacingModifierLetters
-   CombiningDiacriticalMarks
-   GreekBlock
-   CyrillicBlock
-   ArmenianBlock
-   HebrewBlock
-   ArabicBlock
-   SyriacBlock
-   ThaanaBlock
-   DevanagariBlock
-   BengaliBlock
-   GurmukhiBlock
-   GujaratiBlock
-   OriyaBlock
-   TamilBlock
-   TeluguBlock
-   KannadaBlock
-   MalayalamBlock
-   SinhalaBlock
-   ThaiBlock
-   LaoBlock
-   TibetanBlock
-   MyanmarBlock
-   GeorgianBlock
-   HangulJamo
-   EthiopicBlock
-   CherokeeBlock
-   UnifiedCanadianAboriginalSyllabics
-   OghamBlock
-   RunicBlock
-   KhmerBlock
-   MongolianBlock
-   LatinExtendedAdditional
-   GreekExtended
-   GeneralPunctuation
-   SuperscriptsandSubscripts
-   CurrencySymbols
-   CombiningMarksforSymbols
-   LetterlikeSymbols
-   NumberForms
+   Basic Latin
+   Latin-1 Supplement
+   Latin Extended-A
+   Latin Extended-B
+   IPA Extensions
+   Spacing Modifier Letters
+   Combining Diacritical Marks
+   Greek Block
+   Cyrillic Block
+   Armenian Block
+   Hebrew Block
+   Arabic Block
+   Syriac Block
+   Thaana Block
+   Devanagari Block
+   Bengali Block
+   Gurmukhi Block
+   Gujarati Block
+   Oriya Block
+   Tamil Block
+   Telugu Block
+   Kannada Block
+   Malayalam Block
+   Sinhala Block
+   Thai Block
+   Lao Block
+   Tibetan Block
+   Myanmar Block
+   Georgian Block
+   Hangul Jamo
+   Ethiopic Block
+   Cherokee Block
+   Unified Canadian Aboriginal Syllabics
+   Ogham Block
+   Runic Block
+   Khmer Block
+   Mongolian Block
+   Latin Extended Additional
+   Greek Extended
+   General Punctuation
+   Superscripts and Subscripts
+   Currency Symbols
+   Combining Marks for Symbols
+   Letterlike Symbols
+   Number Forms
     Arrows
-   MathematicalOperators
-   MiscellaneousTechnical
-   ControlPictures
-   OpticalCharacterRecognition
-   EnclosedAlphanumerics
-   BoxDrawing
-   BlockElements
-   GeometricShapes
-   MiscellaneousSymbols
+   Mathematical Operators
+   Miscellaneous Technical
+   Control Pictures
+   Optical Character Recognition
+   Enclosed Alphanumerics
+   Box Drawing
+   Block Elements
+   Geometric Shapes
+   Miscellaneous Symbols
     Dingbats
-   BraillePatterns
-   CJKRadicalsSupplement
-   KangxiRadicals
-   IdeographicDescriptionCharacters
-   CJKSymbolsandPunctuation
-   HiraganaBlock
-   KatakanaBlock
-   BopomofoBlock
-   HangulCompatibilityJamo
+   Braille Patterns
+   CJK Radicals Supplement
+   Kangxi Radicals
+   Ideographic Description Characters
+   CJK Symbols and Punctuation
+   Hiragana Block
+   Katakana Block
+   Bopomofo Block
+   Hangul Compatibility Jamo
     Kanbun
-   BopomofoExtended
-   EnclosedCJKLettersandMonths
-   CJKCompatibility
-   CJKUnifiedIdeographsExtensionA
-   CJKUnifiedIdeographs
-   YiSyllables
-   YiRadicals
-   HangulSyllables
-   HighSurrogates
-   HighPrivateUseSurrogates
-   LowSurrogates
-   PrivateUse
-   CJKCompatibilityIdeographs
-   AlphabeticPresentationForms
-   ArabicPresentationFormsA
-   CombiningHalfMarks
-   CJKCompatibilityForms
-   SmallFormVariants
-   ArabicPresentationFormsB
+   Bopomofo Extended
+   Enclosed CJK Letters and Months
+   CJK Compatibility
+   CJK Unified Ideographs Extension A
+   CJK Unified Ideographs
+   Yi Syllables
+   Yi Radicals
+   Hangul Syllables
+   High Surrogates
+   High Private Use Surrogates
+   Low Surrogates
+   Private Use
+   CJK Compatibility Ideographs
+   Alphabetic Presentation Forms
+   Arabic Presentation Forms-A
+   Combining Half Marks
+   CJK Compatibility Forms
+   Small Form Variants
+   Arabic Presentation Forms-B
     Specials
-   HalfwidthandFullwidthForms
-   OldItalicBlock
-   GothicBlock
-   DeseretBlock
-   ByzantineMusicalSymbols
-   MusicalSymbols
-   MathematicalAlphanumericSymbols
-   CJKUnifiedIdeographsExtensionB
-   CJKCompatibilityIdeographsSupplement
+   Halfwidth and Fullwidth Forms
+   Old Italic Block
+   Gothic Block
+   Deseret Block
+   Byzantine Musical Symbols
+   Musical Symbols
+   Mathematical Alphanumeric Symbols
+   CJK Unified Ideographs Extension B
+   CJK Compatibility Ideographs Supplement
     Tags
  
  =item *
diff --git a/t/op/pat.t b/t/op/pat.t

index 0f978d1..6617921 100755 (executable)
--- a/t/op/pat.t
+++ b/t/op/pat.t
@@ -6,7 +6,7 @@
  
  $| = 1;
  
-print "1..747\n";
+print "1..750\n";
  
  BEGIN {
      chdir 't' if -d 't';
@@ -2243,3 +2243,15 @@ print "# some Unicode properties\n";
      print "not " unless "\x{AC00}" =~ /\p{HangulSyllable}/;
      print "ok 747\n";
  }
+
+{
+    print "not " unless "\x{0100}" =~ /\p{Script=Latin}/;
+    print "ok 748\n";
+
+    print "not " unless "\x{0100}" =~ /\p{Block=LatinExtendedA}/;
+    print "ok 749\n";
+
+    print "not " unless "\x{0100}" =~ /\p{Category=UppercaseLetter}/;
+    print "ok 750\n";
+}
+
author	Jarkko Hietaniemi <jhi@iki.fi>
	Fri, 19 Oct 2001 03:25:44 +0000 (03:25 +0000)
committer	Jarkko Hietaniemi <jhi@iki.fi>
	Fri, 19 Oct 2001 03:25:44 +0000 (03:25 +0000)
lib/unicore/Blocks.pl		patch \| blob \| history
lib/unicore/In.pl		patch \| blob \| history
lib/unicore/In/137.pl		patch \| blob \| history
lib/unicore/mktables		patch \| blob \| history
lib/utf8_heavy.pl		patch \| blob \| history
pod/perltodo.pod		patch \| blob \| history
pod/perlunicode.pod		patch \| blob \| history
t/op/pat.t		patch \| blob \| history