Add compexcl(), casefold(), and casespec() interfaces;

author Jarkko Hietaniemi <jhi@iki.fi>

Tue, 10 Jul 2001 15:00:37 +0000 (15:00 +0000)

committer Jarkko Hietaniemi <jhi@iki.fi>

Tue, 10 Jul 2001 15:00:37 +0000 (15:00 +0000)
author Jarkko Hietaniemi <jhi@iki.fi>
Tue, 10 Jul 2001 15:00:37 +0000 (15:00 +0000)
committer Jarkko Hietaniemi <jhi@iki.fi>
Tue, 10 Jul 2001 15:00:37 +0000 (15:00 +0000)
diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm

index ff819cde1a14084fca86903307fcdf29b21f9881..cc5d192227b4995dc1ca3d2e220384849e62611d 100644 (file)
--- a/lib/Unicode/UCD.pm
+++ b/lib/Unicode/UCD.pm
@@ -11,7 +11,9 @@ our @ISA = qw(Exporter);
  our @EXPORT_OK = qw(charinfo
                     charblock charscript
                     charblocks charscripts
-                   charinrange);
+                   charinrange
+                   compexcl
+                   casefold casespec);
  
  use Carp;
  
@@ -22,7 +24,7 @@ Unicode::UCD - Unicode character database
  =head1 SYNOPSIS
  
      use Unicode::UCD 'charinfo';
-    my %charinfo   = charinfo($codepoint);
+    my $charinfo   = charinfo($codepoint);
  
      use Unicode::UCD 'charblock';
      my $charblock  = charblock($codepoint);
@@ -41,6 +43,9 @@ my $UNICODEFH;
  my $BLOCKSFH;
  my $SCRIPTSFH;
  my $VERSIONFH;
+my $COMPEXCLFH;
+my $CASEFOLDFH;
+my $CASESPECFH;
  
  sub openunicode {
      my ($rfh, @path) = @_;
@@ -63,10 +68,10 @@ sub openunicode {
  
      use Unicode::UCD 'charinfo';
  
-    my %charinfo = charinfo(0x41);
+    my $charinfo = charinfo(0x41);
  
-charinfo() returns a hash that has the following fields as defined
-by the Unicode standard:
+charinfo() returns a reference to a hash that has the following fields
+as defined by the Unicode standard:
  
      key
  
@@ -89,7 +94,7 @@ by the Unicode standard:
      block            block the character belongs to (used in \p{In...})
      script           script the character belongs to 
  
-If no match is found, an empty hash is returned.
+If no match is found, a reference to an empty hash is returned.
  
  The C<block> property is the same as as returned by charinfo().  It is
  not defined in the Unicode Character Database proper (Chapter 4 of the
@@ -98,9 +103,7 @@ of TUS3).  Similarly for the C<script> property.
  
  Note that you cannot do (de)composition and casing based solely on the
  above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
-you will need also the I<Composition Exclusions>, I<Case Folding>, and
-I<SpecialCasing> tables, available as files F<CompExcl.txt>,
-F<CaseFold.txt>, and F<SpecCase.txt> in the Perl distribution.
+you will need also the compexcl(), casefold(), and casespec() functions.
  
  =cut
  
@@ -140,7 +143,7 @@ sub charinfo {
             if ($prop{code} eq $hexk) {
                 $prop{block}  = charblock($code);
                 $prop{script} = charscript($code);
-               return %prop;
+               return \%prop;
             }
         }
      }
@@ -188,9 +191,7 @@ sub charinrange {
  
  With a B<code point argument> charblock() returns the block the character
  belongs to, e.g.  C<Basic Latin>.  Note that not all the character
-positions within all blocks are defined.  A <code point argument>
-is either a decimal or a hexadecimal scalar, or "U+" followed
-by hexadecimals.
+positions within all blocks are defined.
  
  If supplied with an argument that can't be a code point, charblock()
  tries to do the opposite and interpret the argument as a character
@@ -250,9 +251,7 @@ sub charblock {
      my $ranges     = charscript('Thai');
  
  With a B<code point argument> charscript() returns the script the
-character belongs to, e.g.  C<Latin>, C<Greek>, C<Han>.  A <code point
-argument> is either a decimal or a hexadecimal scalar, or "U+"
-followed by hexadecimals.
+character belongs to, e.g.  C<Latin>, C<Greek>, C<Han>.
  
  If supplied with an argument that can't be a code point, charscript()
  tries to do the opposite and interpret the argument as a character
@@ -308,16 +307,16 @@ sub charscript {
  
      use Unicode::UCD 'charblocks';
  
-    my %charblocks = charblocks();
+    my $charblocks = charblocks();
  
-charblocks() returns a hash with the known block names as the keys,
-and the code point ranges (see L</charblock>) as the values.
+charblocks() returns a reference to a hash with the known block names
+as the keys, and the code point ranges (see L</charblock>) as the values.
  
  =cut
  
  sub charblocks {
-    _charblocks() unless @BLOCKS;
-    return %BLOCKS;
+    _charblocks() unless %BLOCKS;
+    return \%BLOCKS;
  }
  
  =head2 charscripts
@@ -332,8 +331,8 @@ and the code point ranges (see L</charscript>) as the values.
  =cut
  
  sub charscripts {
-    _charscripts() unless @SCRIPTS;
-    return %SCRIPTS;
+    _charscripts() unless %SCRIPTS;
+    return \%SCRIPTS;
  }
  
  =head2 Blocks versus Scripts
@@ -368,6 +367,11 @@ There are a few cases where there exists both a script and a block by
  the same name, in these cases the block version has C<Block> appended:
  C<\p{InKatakana}> is the script, C<\p{InKatakanaBlock}> is the block.
  
+=head2 Code Point Arguments
+
+A <code point argument> is either a decimal or a hexadecimal scalar,
+or "U+" followed by hexadecimals.
+
  =head2 charinrange
  
  In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
@@ -382,6 +386,191 @@ by L</charblocks> and </charscripts> by using charinrange():
  
  =cut
  
+=head2 compexcl
+
+    use Unicode::UCD 'compexcl';
+
+    my $compexcl = compexcl("09dc");
+
+The compexcl() returns the composition exclusion (that is, if the
+character cannot be decomposed) of the character specified by a B<code
+point argument>.
+
+If there is a composition exclusion for the character, true is
+returned.  Otherwise, false is returned.
+
+=cut
+
+my %COMPEXCL;
+
+sub _compexcl {
+    unless (%COMPEXCL) {
+       if (openunicode(\$COMPEXCLFH, "CompExcl.txt")) {
+           while (<$COMPEXCLFH>) {
+               if (/^([0-9A-F]+) \# /) {
+                   my $code = hex($1);
+                   $COMPEXCL{$code} = undef;
+               }
+           }
+           close($COMPEXCLFH);
+       }
+    }
+}
+
+sub compexcl {
+    my $arg  = shift;
+    my $code = _getcode($arg);
+
+    _compexcl() unless %COMPEXCL;
+
+    return exists $COMPEXCL{$code};
+}
+
+=head2 casefold
+
+    use Unicode::UCD 'casefold';
+
+    my %casefold = casefold("09dc");
+
+The casefold() returns the locale-independent case folding of the
+character specified by a B<code point argument>.
+
+If there is a case folding for that character, a reference to a hash
+with the following fields is returned:
+
+    key
+
+    code             code point with at least four hexdigits
+    status           "C", "F", "S", or "I"
+    mapping          one or more codes separated by spaces
+
+The meaning of the I<status> is as follows:
+
+   C                 common case folding, common mappings shared
+                     by both simple and full mappings
+   F                 full case folding, mappings that cause strings
+                     to grow in length. Multiple characters are separated
+                     by spaces
+   S                 simple case folding, mappings to single characters
+                     where different from F
+   I                 special case for dotted uppercase I and
+                     dotless lowercase i
+                     - If this mapping is included, the result is
+                       case-insensitive, but dotless and dotted I's
+                       are not distinguished
+                     - If this mapping is excluded, the result is not
+                       fully case-insensitive, but dotless and dotted
+                       I's are distinguished
+
+If there is no case folding for that character, C<undef> is returned.
+
+For more information about case mappings see
+http://www.unicode.org/unicode/reports/tr21/
+
+=cut
+
+my %CASEFOLD;
+
+sub _casefold {
+    unless (%CASEFOLD) {
+       if (openunicode(\$CASEFOLDFH, "CaseFold.txt")) {
+           while (<$CASEFOLDFH>) {
+               if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
+                   my $code = hex($1);
+                   $CASEFOLD{$code} = { code    => $1,
+                                        status  => $2,
+                                        mapping => $3 };
+               }
+           }
+           close($CASEFOLDFH);
+       }
+    }
+}
+
+sub casefold {
+    my $arg  = shift;
+    my $code = _getcode($arg);
+
+    _casefold() unless %CASEFOLD;
+
+    return $CASEFOLD{$code};
+}
+
+=head2 casespec
+
+    use Unicode::UCD 'casespec';
+
+    my %casespec = casespec("09dc");
+
+The casespec() returns the potentially locale-dependent case mapping
+of the character specified by a B<code point argument>.  The mapping
+may change the length of the string (which the basic Unicode case
+mappings as returned by charinfo() never do).
+
+If there is a case folding for that character, a reference to a hash
+with the following fields is returned:
+
+    key
+
+    code             code point with at least four hexdigits
+    lower            lowercase
+    title            titlecase
+    upper            uppercase
+    condition        condition list (may be undef)
+
+The C<condition> is optional.  Where present, it consists of one or
+more I<locales> or I<contexts>, separated by spaces (other than as
+used to separate elements, spaces are to be ignored).  A condition
+list overrides the normal behavior if all of the listed conditions are
+true.  Case distinctions in the condition list are not significant.
+Conditions preceded by "NON_" represent the negation of the condition
+
+A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
+followed by a "_" and a 2-letter ISO language code (, possibly followed
+by a "_" and a variant code).  You can find the list of those codes
+in L<Locale::Country> and L<Locale::Language>.
+
+A I<context> is one of the following choices:
+
+    FINAL            The letter is not followed by a letter of
+                     general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
+    MODERN           The mapping is only used for modern text
+    AFTER_i          The last base character was "i" 0069
+
+For more information about case mappings see
+http://www.unicode.org/unicode/reports/tr21/
+
+=cut
+
+my %CASESPEC;
+
+sub _casespec {
+    unless (%CASESPEC) {
+       if (openunicode(\$CASESPECFH, "SpecCase.txt")) {
+           while (<$CASESPECFH>) {
+               if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
+                   my $code = hex($1);
+                   $CASESPEC{$code} = { code      => $1,
+                                        lower     => $2,
+                                        title     => $3,
+                                        upper     => $4,
+                                        condition => $5 };
+               }
+           }
+           close($CASESPECFH);
+       }
+    }
+}
+
+sub casespec {
+    my $arg  = shift;
+    my $code = _getcode($arg);
+
+    _casespec() unless %CASESPEC;
+
+    return $CASESPEC{$code};
+}
+
  =head2 Unicode::UCD::UnicodeVersion
  
  Unicode::UCD::UnicodeVersion() returns the version of the Unicode Character
diff --git a/lib/Unicode/UCD.t b/lib/Unicode/UCD.t

index 3cd11925b1534632bf220a55b238e0acf866b834..6ebea8a1a63678d5b1641ebbce1b9aa383e396a0 100644 (file)
--- a/lib/Unicode/UCD.t
+++ b/lib/Unicode/UCD.t
@@ -3,95 +3,95 @@ use Unicode::UCD;
  use Test;
  use strict;
  
-BEGIN { plan tests => 103 };
+BEGIN { plan tests => 111 };
  
  use Unicode::UCD 'charinfo';
  
-my %charinfo;
-
-%charinfo = charinfo(0x41);
-
-ok($charinfo{code},           '0041');
-ok($charinfo{name},           'LATIN CAPITAL LETTER A');
-ok($charinfo{category},       'Lu');
-ok($charinfo{combining},      '0');
-ok($charinfo{bidi},           'L');
-ok($charinfo{decomposition},  '');
-ok($charinfo{decimal},        '');
-ok($charinfo{digit},          '');
-ok($charinfo{numeric},        '');
-ok($charinfo{mirrored},       'N');
-ok($charinfo{unicode10},      '');
-ok($charinfo{comment},        '');
-ok($charinfo{upper},          '');
-ok($charinfo{lower},          '0061');
-ok($charinfo{title},          '');
-ok($charinfo{block},          'Basic Latin');
-ok($charinfo{script},         'Latin');
-
-%charinfo = charinfo(0x100);
-
-ok($charinfo{code},           '0100');
-ok($charinfo{name},           'LATIN CAPITAL LETTER A WITH MACRON');
-ok($charinfo{category},       'Lu');
-ok($charinfo{combining},      '0');
-ok($charinfo{bidi},           'L');
-ok($charinfo{decomposition},  '0041 0304');
-ok($charinfo{decimal},        '');
-ok($charinfo{digit},          '');
-ok($charinfo{numeric},        '');
-ok($charinfo{mirrored},       'N');
-ok($charinfo{unicode10},      'LATIN CAPITAL LETTER A MACRON');
-ok($charinfo{comment},        '');
-ok($charinfo{upper},          '');
-ok($charinfo{lower},          '0101');
-ok($charinfo{title},          '');
-ok($charinfo{block},          'Latin Extended-A');
-ok($charinfo{script},         'Latin');
+my $charinfo;
+
+$charinfo = charinfo(0x41);
+
+ok($charinfo->{code},           '0041');
+ok($charinfo->{name},           'LATIN CAPITAL LETTER A');
+ok($charinfo->{category},       'Lu');
+ok($charinfo->{combining},      '0');
+ok($charinfo->{bidi},           'L');
+ok($charinfo->{decomposition},  '');
+ok($charinfo->{decimal},        '');
+ok($charinfo->{digit},          '');
+ok($charinfo->{numeric},        '');
+ok($charinfo->{mirrored},       'N');
+ok($charinfo->{unicode10},      '');
+ok($charinfo->{comment},        '');
+ok($charinfo->{upper},          '');
+ok($charinfo->{lower},          '0061');
+ok($charinfo->{title},          '');
+ok($charinfo->{block},          'Basic Latin');
+ok($charinfo->{script},         'Latin');
+
+$charinfo = charinfo(0x100);
+
+ok($charinfo->{code},           '0100');
+ok($charinfo->{name},           'LATIN CAPITAL LETTER A WITH MACRON');
+ok($charinfo->{category},       'Lu');
+ok($charinfo->{combining},      '0');
+ok($charinfo->{bidi},           'L');
+ok($charinfo->{decomposition},  '0041 0304');
+ok($charinfo->{decimal},        '');
+ok($charinfo->{digit},          '');
+ok($charinfo->{numeric},        '');
+ok($charinfo->{mirrored},       'N');
+ok($charinfo->{unicode10},      'LATIN CAPITAL LETTER A MACRON');
+ok($charinfo->{comment},        '');
+ok($charinfo->{upper},          '');
+ok($charinfo->{lower},          '0101');
+ok($charinfo->{title},          '');
+ok($charinfo->{block},          'Latin Extended-A');
+ok($charinfo->{script},         'Latin');
  
  # 0x0590 is in the Hebrew block but unused.
  
-%charinfo = charinfo(0x590);
-
-ok($charinfo{code},          undef);
-ok($charinfo{name},          undef);
-ok($charinfo{category},      undef);
-ok($charinfo{combining},     undef);
-ok($charinfo{bidi},          undef);
-ok($charinfo{decomposition}, undef);
-ok($charinfo{decimal},       undef);
-ok($charinfo{digit},         undef);
-ok($charinfo{numeric},       undef);
-ok($charinfo{mirrored},      undef);
-ok($charinfo{unicode10},     undef);
-ok($charinfo{comment},       undef);
-ok($charinfo{upper},         undef);
-ok($charinfo{lower},         undef);
-ok($charinfo{title},         undef);
-ok($charinfo{block},         undef);
-ok($charinfo{script},        undef);
+$charinfo = charinfo(0x590);
+
+ok($charinfo->{code},          undef);
+ok($charinfo->{name},          undef);
+ok($charinfo->{category},      undef);
+ok($charinfo->{combining},     undef);
+ok($charinfo->{bidi},          undef);
+ok($charinfo->{decomposition}, undef);
+ok($charinfo->{decimal},       undef);
+ok($charinfo->{digit},         undef);
+ok($charinfo->{numeric},       undef);
+ok($charinfo->{mirrored},      undef);
+ok($charinfo->{unicode10},     undef);
+ok($charinfo->{comment},       undef);
+ok($charinfo->{upper},         undef);
+ok($charinfo->{lower},         undef);
+ok($charinfo->{title},         undef);
+ok($charinfo->{block},         undef);
+ok($charinfo->{script},        undef);
  
  # 0x05d0 is in the Hebrew block and used.
  
-%charinfo = charinfo(0x5d0);
-
-ok($charinfo{code},           '05D0');
-ok($charinfo{name},           'HEBREW LETTER ALEF');
-ok($charinfo{category},       'Lo');
-ok($charinfo{combining},      '0');
-ok($charinfo{bidi},           'R');
-ok($charinfo{decomposition},  '');
-ok($charinfo{decimal},        '');
-ok($charinfo{digit},          '');
-ok($charinfo{numeric},        '');
-ok($charinfo{mirrored},       'N');
-ok($charinfo{unicode10},      '');
-ok($charinfo{comment},        '');
-ok($charinfo{upper},          '');
-ok($charinfo{lower},          '');
-ok($charinfo{title},          '');
-ok($charinfo{block},          'Hebrew');
-ok($charinfo{script},         'Hebrew');
+$charinfo = charinfo(0x5d0);
+
+ok($charinfo->{code},           '05D0');
+ok($charinfo->{name},           'HEBREW LETTER ALEF');
+ok($charinfo->{category},       'Lo');
+ok($charinfo->{combining},      '0');
+ok($charinfo->{bidi},           'R');
+ok($charinfo->{decomposition},  '');
+ok($charinfo->{decimal},        '');
+ok($charinfo->{digit},          '');
+ok($charinfo->{numeric},        '');
+ok($charinfo->{mirrored},       'N');
+ok($charinfo->{unicode10},      '');
+ok($charinfo->{comment},        '');
+ok($charinfo->{upper},          '');
+ok($charinfo->{lower},          '');
+ok($charinfo->{title},          '');
+ok($charinfo->{block},          'Hebrew');
+ok($charinfo->{script},         'Hebrew');
  
  use Unicode::UCD qw(charblock charscript);
  
@@ -100,39 +100,39 @@ use Unicode::UCD qw(charblock charscript);
  ok(charblock(0x590),          'Hebrew');
  ok(charscript(0x590),         undef);
  
-%charinfo = charinfo(0xbe);
-
-ok($charinfo{code},           '00BE');
-ok($charinfo{name},           'VULGAR FRACTION THREE QUARTERS');
-ok($charinfo{category},       'No');
-ok($charinfo{combining},      '0');
-ok($charinfo{bidi},           'ON');
-ok($charinfo{decomposition},  '<fraction> 0033 2044 0034');
-ok($charinfo{decimal},        '');
-ok($charinfo{digit},          '');
-ok($charinfo{numeric},        '3/4');
-ok($charinfo{mirrored},       'N');
-ok($charinfo{unicode10},      'FRACTION THREE QUARTERS');
-ok($charinfo{comment},        '');
-ok($charinfo{upper},          '');
-ok($charinfo{lower},          '');
-ok($charinfo{title},          '');
-ok($charinfo{block},          'Latin-1 Supplement');
-ok($charinfo{script},         undef);
+$charinfo = charinfo(0xbe);
+
+ok($charinfo->{code},           '00BE');
+ok($charinfo->{name},           'VULGAR FRACTION THREE QUARTERS');
+ok($charinfo->{category},       'No');
+ok($charinfo->{combining},      '0');
+ok($charinfo->{bidi},           'ON');
+ok($charinfo->{decomposition},  '<fraction> 0033 2044 0034');
+ok($charinfo->{decimal},        '');
+ok($charinfo->{digit},          '');
+ok($charinfo->{numeric},        '3/4');
+ok($charinfo->{mirrored},       'N');
+ok($charinfo->{unicode10},      'FRACTION THREE QUARTERS');
+ok($charinfo->{comment},        '');
+ok($charinfo->{upper},          '');
+ok($charinfo->{lower},          '');
+ok($charinfo->{title},          '');
+ok($charinfo->{block},          'Latin-1 Supplement');
+ok($charinfo->{script},         undef);
  
  use Unicode::UCD qw(charblocks charscripts);
  
-my %charblocks = charblocks();
+my $charblocks = charblocks();
  
-ok(exists $charblocks{Thai});
-ok($charblocks{Thai}->[0]->[0], hex('0e00'));
-ok(!exists $charblocks{PigLatin});
+ok(exists $charblocks->{Thai});
+ok($charblocks->{Thai}->[0]->[0], hex('0e00'));
+ok(!exists $charblocks->{PigLatin});
  
-my %charscripts = charscripts();
+my $charscripts = charscripts();
  
-ok(exists $charscripts{Armenian});
-ok($charscripts{Armenian}->[0]->[0], hex('0531'));
-ok(!exists $charscripts{PigLatin});
+ok(exists $charscripts->{Armenian});
+ok($charscripts->{Armenian}->[0]->[0], hex('0531'));
+ok(!exists $charscripts->{PigLatin});
  
  my $charscript;
  
@@ -160,3 +160,48 @@ ok( charinrange($ranges, "13f4"));
  ok(!charinrange($ranges, "13f5"));
  
  ok(Unicode::UCD::UnicodeVersion, 3.1);
+
+use Unicode::UCD qw(compexcl);
+
+ok(!compexcl(0x0100));
+ok( compexcl(0x0958));
+
+use Unicode::UCD qw(casefold);
+
+my $casefold;
+
+$casefold = casefold(0x41);
+
+ok($casefold->{code} eq '0041' &&
+   $casefold->{status} eq 'C'  &&
+   $casefold->{mapping} eq '0061');
+
+$casefold = casefold(0xdf);
+
+ok($casefold->{code} eq '00DF' &&
+   $casefold->{status} eq 'F'  &&
+   $casefold->{mapping} eq '0073 0073');
+
+ok(!casefold(0x20));
+
+use Unicode::UCD qw(casespec);
+
+my $casespec;
+
+ok(!casespec(0x41));
+
+$casespec = casespec(0xdf);
+
+ok($casespec->{code} eq '00DF' &&
+   $casespec->{lower} eq '00DF'  &&
+   $casespec->{title} eq '0053 0073'  &&
+   $casespec->{upper} eq '0053 0053' &&
+   $casespec->{condition} eq undef);
+
+$casespec = casespec(0x307);
+
+ok($casespec->{code} eq '0307' &&
+   $casespec->{lower} eq '0307'  &&
+   $casespec->{title} eq ''  &&
+   $casespec->{upper} eq '' &&
+   $casespec->{condition} eq 'lt AFTER_i');
author	Jarkko Hietaniemi <jhi@iki.fi>
	Tue, 10 Jul 2001 15:00:37 +0000 (15:00 +0000)
committer	Jarkko Hietaniemi <jhi@iki.fi>
	Tue, 10 Jul 2001 15:00:37 +0000 (15:00 +0000)
lib/Unicode/UCD.pm		patch \| blob \| history
lib/Unicode/UCD.t		patch \| blob \| history