Unicode::Collate v0.09

author 貞廣知行 <bqw10602@nifty.com>

Mon, 12 Nov 2001 00:22:57 +0000 (09:22 +0900)

committer Abhijit Menon-Sen <ams@wiw.org>

Sun, 11 Nov 2001 16:45:00 +0000 (16:45 +0000)
author 貞廣知行 <bqw10602@nifty.com>
Mon, 12 Nov 2001 00:22:57 +0000 (09:22 +0900)
committer Abhijit Menon-Sen <ams@wiw.org>
Sun, 11 Nov 2001 16:45:00 +0000 (16:45 +0000)
diff --git a/lib/Unicode/Collate.pm b/lib/Unicode/Collate.pm

index 2ffda37..3393d43 100644 (file)
--- a/lib/Unicode/Collate.pm
+++ b/lib/Unicode/Collate.pm
@@ -4,9 +4,10 @@ use 5.006;
  use strict;
  use warnings;
  use Carp;
+
  require Exporter;
  
-our $VERSION = '0.08';
+our $VERSION = '0.09';
  our $PACKAGE = __PACKAGE__;
  
  our @ISA = qw(Exporter);
@@ -18,16 +19,7 @@ our @EXPORT = ();
  (our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
  our $KeyFile = "allkeys.txt";
  
-# Lingua::KO::Hangul::Util not part of the standard distribution
-# but it will be used if available.
-
-eval { require Lingua::KO::Hangul::Util };
-my $hasHangulUtil = ! $@;
-if ($hasHangulUtil) {
-    Lingua::KO::Hangul::Util->import();
-}
-
-our %Combin; # combining class from Unicode::Normalize
+our $getCombinClass; # coderef for combining class from Unicode::Normalize
  
  use constant Min2      => 0x20;   # minimum weight at level 2
  use constant Min3      => 0x02;   # minimum weight at level 3
@@ -47,14 +39,20 @@ sub new
       ! defined $self->{alternate} ? '' : $self->{alternate};
  
    # collation level
-  $self->{level} ||= ($self->{alternate} =~ /shift/ ? 4 : 3);
+  $self->{level} ||= ($self->{alternate} =~ /^shift/ ? 4 : 3);
  
    # normalization form
    $self->{normalization} = 'D' if ! exists $self->{normalization};
  
-  eval "use Unicode::Normalize;" if defined $self->{normalization};
+  if(defined $self->{normalization}){
+    eval "use Unicode::Normalize;";
+    croak "you'd install Unicode::Normalize for normalization forms: $@"
+       if $@;
+    $getCombinClass = \&Unicode::Normalize::getCombinClass
+       if ! $getCombinClass;
+  }
  
-  $self->{normalize} = 
+  $self->{UNF} = 
      ! defined $self->{normalization}        ? undef :
      $self->{normalization} =~ /^(?:NF)?C$/  ? \&NFC :
      $self->{normalization} =~ /^(?:NF)?D$/  ? \&NFD :
@@ -62,8 +60,6 @@ sub new
      $self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD :
      croak "$PACKAGE unknown normalization form name: $self->{normalization}";
  
-  *Combin = \%Unicode::Normalize::Combin if $self->{normalize} && ! %Combin;
-
    # backwards
    $self->{backwards} ||= [];
    $self->{backwards} = [ $self->{backwards} ] if ! ref $self->{backwards};
@@ -188,7 +184,7 @@ sub splitCE
  {
    my $self = shift;
    my $code = $self->{preprocess};
-  my $norm = $self->{normalize};
+  my $norm = $self->{UNF};
    my $ent  = $self->{entries};
    my $max  = $self->{maxlength};
    my $rear = $self->{rearrangeHash};
@@ -229,15 +225,15 @@ sub splitCE
      }
      else {  $ch = pack('U', $u) }
  
-    if(%Combin && defined $ch) # with Combining Char
+  # with Combining Char (UTS#10, 4.2.1), here requires Unicode::Normalize.
+    if($getCombinClass && defined $ch)
      {
-      for(my $j = $i+1; $j < @src && $Combin{ $src[$j] }; $j++)
+      for(my $j = $i+1; $j < @src && $getCombinClass->( $src[$j] ); $j++)
        {
          my $comb = pack 'U', $src[$j];
          next if ! $ent->{ $ch.$comb };
          $ch .= $comb;
          splice(@src, $j, 1);
-        last;
        }
      }
      push @buf, $ch;
@@ -264,10 +260,7 @@ sub getWt
      _isHangul($u)
        ? $hang
          ? &$hang($u)
-        : ($hasHangulUtil ?
-              map(@{ $ent->{pack('U', $_)} }, decomposeHangul($u)) :
-             # runtime compile error...
-              (eval 'use Lingua::KO::Hangul::Util', print $@))
+        : map(@{ $ent->{pack('U', $_)} }, _decompHangul($u))
        : _isCJK($u)
          ? $cjk ? &$cjk($u) : map($self->altCE(0,@$_), _CJK($u))
          : map($self->altCE(0,@$_), _derivCE($u));
@@ -383,13 +376,13 @@ sub getSortKey
  ##
  ## int compare = cmp(string a, string b)
  ##
-sub cmp
-{
-  my $obj = shift;
-  my $a   = shift;
-  my $b   = shift;
-  $obj->getSortKey($a) cmp $obj->getSortKey($b);
-}
+sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
+sub eq  { $_[0]->getSortKey($_[1]) eq  $_[0]->getSortKey($_[2]) }
+sub ne  { $_[0]->getSortKey($_[1]) ne  $_[0]->getSortKey($_[2]) }
+sub lt  { $_[0]->getSortKey($_[1]) lt  $_[0]->getSortKey($_[2]) }
+sub le  { $_[0]->getSortKey($_[1]) le  $_[0]->getSortKey($_[2]) }
+sub gt  { $_[0]->getSortKey($_[1]) gt  $_[0]->getSortKey($_[2]) }
+sub ge  { $_[0]->getSortKey($_[1]) ge  $_[0]->getSortKey($_[2]) }
  
  ##
  ## list[strings] sorted = sort(list[strings] arg)
@@ -419,11 +412,7 @@ sub _derivCE
  ##
  ## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
  ##
-sub _getHexArray
-{
-  my $str = shift;
-  map hex(), $str =~ /([0-9a-fA-F]+)/g;
-}
+sub _getHexArray { map hex(), $_[0] =~ /([0-9a-fA-F]+)/g }
  
  ##
  ## bool is_a_CJK_Unified_Ideograph = _isCJK(int codepoint)
@@ -439,19 +428,25 @@ sub _isCJK
  ##
  ## list[arrayref] CE = _CJK(int codepoint_of_CJK)
  ##
-sub _CJK
-{
-  my $u = shift;
-  $u > 0xFFFF ? _derivCE($u) : [$u,0x20,0x02,$u];
-}
+sub _CJK { $_[0] > 0xFFFF ? _derivCE($_[0]) : [$_[0], 0x20, 0x02, $_[0]] }
  
  ##
  ## bool is_a_Hangul_Syllable = _isHangul(int codepoint) 
  ##
-sub _isHangul
-{
-  my $code = shift;
-  return 0xAC00 <= $code && $code <= 0xD7A3;
+sub _isHangul { return 0xAC00 <= $_[0] && $_[0] <= 0xD7A3 }
+
+sub _decompHangul {
+    my $code = shift;
+  # $code must be in Hangul syllable. check it before you enter here.
+    my $SIndex = $code - 0xAC00;
+    my $LIndex = int( $SIndex / 588);
+    my $VIndex = int(($SIndex % 588) / 28);
+    my $TIndex =      $SIndex % 28;
+    return (
+       0x1100 + $LIndex,
+       0x1161 + $VIndex,
+      $TIndex ? (0x11A7 + $TIndex) : (),
+    );
  }
  
  1;
@@ -466,13 +461,13 @@ Unicode::Collate - use UCA (Unicode Collation Algorithm)
    use Unicode::Collate;
  
    #construct
-  $UCA = Unicode::Collate->new(%tailoring);
+  $Collator = Unicode::Collate->new(%tailoring);
  
    #sort
-  @sorted = $UCA->sort(@not_sorted);
+  @sorted = $Collator->sort(@not_sorted);
  
    #compare
-  $result = $UCA->cmp($a, $b); # returns 1, 0, or -1. 
+  $result = $Collator->cmp($a, $b); # returns 1, 0, or -1. 
  
  =head1 DESCRIPTION
  
@@ -480,7 +475,7 @@ Unicode::Collate - use UCA (Unicode Collation Algorithm)
  
  The C<new> method returns a collator object.
  
-   $UCA = Unicode::Collate->new(
+   $Collator = Unicode::Collate->new(
        alternate => $alternate,
        backwards => $levelNumber, # or \@levelNumbers
        entry => $element,
@@ -499,7 +494,7 @@ The C<new> method returns a collator object.
        upper_before_lower => $bool,
     );
     # if %tailoring is false (empty),
-   # $UCA should do the default collation.
+   # $Collator should do the default collation.
  
  =over 4
  
@@ -675,17 +670,38 @@ If the parameter is true, this is reversed.
  
  =over 4
  
-=item C<@sorted = $UCA-E<gt>sort(@not_sorted)>
+=item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
  
  Sorts a list of strings.
  
-=item C<$result = $UCA-E<gt>cmp($a, $b)>
+=item C<$result = $Collator-E<gt>cmp($a, $b)>
  
  Returns 1 (when C<$a> is greater than C<$b>)
  or 0 (when C<$a> is equal to C<$b>)
  or -1 (when C<$a> is lesser than C<$b>).
  
-=item C<$sortKey = $UCA-E<gt>getSortKey($string)>
+=item C<$result = $Collator-E<gt>eq($a, $b)>
+
+=item C<$result = $Collator-E<gt>ne($a, $b)>
+
+=item C<$result = $Collator-E<gt>lt($a, $b)>
+
+=item C<$result = $Collator-E<gt>le($a, $b)>
+
+=item C<$result = $Collator-E<gt>gt($a, $b)>
+
+=item C<$result = $Collator-E<gt>ge($a, $b)>
+
+They works like the same name operators as theirs. 
+
+   eq : whether $a is equal to $b.
+   ne : whether $a is not equal to $b.
+   lt : whether $a is lesser than $b.
+   le : whether $a is lesser than $b or equal to $b.
+   gt : whether $a is greater than $b.
+   ge : whether $a is greater than $b or equal to $b.
+
+=item C<$sortKey = $Collator-E<gt>getSortKey($string)>
  
  -- see 4.3 Form a sort key for each string, UTR #10.
  
@@ -694,15 +710,15 @@ Returns a sort key.
  You compare the sort keys using a binary comparison
  and get the result of the comparison of the strings using UCA.
  
-   $UCA->getSortKey($a) cmp $UCA->getSortKey($b)
+   $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
  
        is equivalent to
  
-   $UCA->cmp($a, $b)
+   $Collator->cmp($a, $b)
  
-=item C<$position = $UCA-E<gt>index($string, $substring)>
+=item C<$position = $Collator-E<gt>index($string, $substring)>
  
-=item C<($position, $length) = $UCA-E<gt>index($string, $substring)>
+=item C<($position, $length) = $Collator-E<gt>index($string, $substring)>
  
  -- see 6.8 Searching, UTR #10.
  
@@ -727,12 +743,12 @@ an empty list in list context.
  
  e.g. you say
  
-  my $UCA = Unicode::Collate->new( normalization => undef, level => 1 );
+  my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
    my $str = "Ich mu\x{00DF} studieren.";
    my $sub = "m\x{00FC}ss";
    my $match;
-  if(my @tmp = $UCA->index($str, $sub)){
-    $match = substr($str, $tmp[0], $tmp[1]);
+  if(my($pos,$len) = $Collator->index($str, $sub)){
+      $match = substr($str, $pos, $len);
    }
  
  and get C<"mu\x{00DF}"> in C<$match> since C<"mu>E<223>C<">
@@ -749,10 +765,12 @@ None by default.
  Use of the C<normalization> parameter requires
  the B<Unicode::Normalize> module.
  
-If you need not it (e.g. in the case when you need not
+If you need not it (say, in the case when you need not
  handle any combining characters),
  assign C<normalization =E<gt> undef> explicitly.
  
+-- see 6.5 Avoiding Normalization, UTR #10.
+
  =head1 AUTHOR
  
  SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
@@ -768,18 +786,14 @@ SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
  
  =over 4
  
-=item L<Lingua::KO::Hangul::Util>
+=item Unicode Collation Algorithm - Unicode TR #10
  
-utility functions for Hangul Syllables
+http://www.unicode.org/unicode/reports/tr10/
  
  =item L<Unicode::Normalize>
  
  normalized forms of Unicode text
  
-=item Unicode Collation Algorithm - Unicode TR #10
-
-http://www.unicode.org/unicode/reports/tr10/
-
  =back
  
  =cut
diff --git a/lib/Unicode/Collate/Changes b/lib/Unicode/Collate/Changes

index 9b4ed1b..3d6acdb 100644 (file)
--- a/lib/Unicode/Collate/Changes
+++ b/lib/Unicode/Collate/Changes
@@ -1,5 +1,14 @@
  Revision history for Perl extension Unicode::Collate.
  
+0.09  Sun Nov 11 17:02:40:18 2001
+    - add the following methods: eq, ne, lt, le, gt, le.
+    - relies on &Unicode::Normalize::getCombinClass()
+      in place of %Unicode::Normalize::Combin
+      (the hash is not defined in the XS version of Unicode::Normalize).
+      then you should install Unicode::Normalize 0.10 or later.
+    - now independent of Lingua::KO::Hangul::Util
+      (this module does decomposition of Hangul syllables for itself)
+
  0.08  Mon Aug 20 22:40:18 2001
      - add the index method.
  
diff --git a/lib/Unicode/Collate/README b/lib/Unicode/Collate/README

index 0cf2bc3..c84a73c 100644 (file)
--- a/lib/Unicode/Collate/README
+++ b/lib/Unicode/Collate/README
@@ -1,4 +1,4 @@
-Unicode/Collate version 0.08
+Unicode/Collate version 0.09
  ===============================
  
  use UCA (Unicode Collation Algorithm)
@@ -17,13 +17,15 @@ SYNOPSIS
    use Unicode::Collate;
  
    #construct
-  $UCA = Unicode::Collate->new(%tailoring);
+  $Collator = Unicode::Collate->new(%tailoring);
  
    #sort
-  @sorted = $UCA->sort(@not_sorted);
+  @sorted = $Collator->sort(@not_sorted);
  
    #compare
-  $result = $UCA->cmp($a, $b); # returns 1, 0, or -1.
+  $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
+  $result = $Collator->eq($a, $b);  # returns true/false
+    (similarly ->ne, ->lt, ->le, ->gt, ->ge)
  
  INSTALLATION
  
@@ -38,13 +40,7 @@ To install this module type the following:
  
  DEPENDENCIES
  
-This module requires these other modules and libraries:
-
-Carp
-Exporter
-Lingua::KO::Hangul::Util
-
-It's better if Unicode::Normalize has been installed
+It's better if you have Unicode::Normalize (v 0.10 or later)
  although Unicode::Collate can be used without Unicode::Normalize.
  
  COPYRIGHT AND LICENCE
diff --git a/lib/Unicode/Collate/t/test.t b/lib/Unicode/Collate/t/test.t

index 352c44f..48bf412 100644 (file)
--- a/lib/Unicode/Collate/t/test.t
+++ b/lib/Unicode/Collate/t/test.t
@@ -4,21 +4,21 @@
  #########################
  
  use Test;
-BEGIN { plan tests => 22 };
+BEGIN { plan tests => 50 };
  use Unicode::Collate;
  ok(1); # If we made it this far, we're ok.
  
  #########################
  
-my $UCA = Unicode::Collate->new(
+my $Collator = Unicode::Collate->new(
    table => 'keys.txt',
    normalization => undef,
  );
  
-ok(ref $UCA, "Unicode::Collate");
+ok(ref $Collator, "Unicode::Collate");
  
  ok(
-  join(':', $UCA->sort( 
+  join(':', $Collator->sort( 
      qw/ lib strict Carp ExtUtils CGI Time warnings Math overload Pod CPAN /
    ) ),
    join(':',
@@ -29,10 +29,13 @@ ok(
  my $A_acute = pack('U', 0x00C1);
  my $acute   = pack('U', 0x0301);
  
-ok($UCA->cmp("A$acute", $A_acute), -1);
+ok($Collator->cmp("A$acute", $A_acute), -1);
  
-ok($UCA->cmp("", ""), 0);
-ok($UCA->cmp("", "perl"), -1);
+ok($Collator->cmp("", ""), 0);
+ok(! $Collator->ne("", "") );
+ok(  $Collator->eq("", "") );
+
+ok($Collator->cmp("", "perl"), -1);
  
  eval "use Unicode::Normalize";
  
@@ -53,6 +56,7 @@ my $tr = Unicode::Collate->new(
    entry => <<'ENTRIES',
  0063 0068 ; [.0893.0020.0002.0063]  # "ch" in traditional Spanish
  0043 0068 ; [.0893.0020.0008.0043]  # "Ch" in traditional Spanish
+00DF ; [.09F3.0154.0004.00DF] [.09F3.0020.0004.00DF] # eszet in Germany
  ENTRIES
  );
  
@@ -66,7 +70,7 @@ ok(
  );
  
  ok(
-  join(':', $UCA->sort( 
+  join(':', $Collator->sort( 
      qw/ acha aca ada acia acka /
    ) ),
    join(':',
@@ -74,36 +78,60 @@ ok(
    ),
  );
  
-my $old_level = $UCA->{level};
+my $old_level = $Collator->{level};
  my $hiragana = "\x{3042}\x{3044}";
  my $katakana = "\x{30A2}\x{30A4}";
  
-$UCA->{level} = 2;
+$Collator->{level} = 2;
+
+ok( $Collator->cmp("ABC","abc"), 0);
+ok( $Collator->eq("ABC","abc") );
+ok( $Collator->le("ABC","abc") );
+ok( $Collator->cmp($hiragana, $katakana), 0);
+ok( $Collator->eq($hiragana, $katakana) );
+ok( $Collator->ge($hiragana, $katakana) );
  
-ok( $UCA->cmp("ABC","abc"), 0);
-ok( $UCA->cmp($hiragana, $katakana), 0);
+# hangul
+ok( $Collator->eq("a\x{AC00}b", "a\x{1100}\x{1161}b") );
+ok( $Collator->eq("a\x{AE00}b", "a\x{1100}\x{1173}\x{11AF}b") );
+ok( $Collator->gt("a\x{AE00}b", "a\x{1100}\x{1173}b\x{11AF}") );
+ok( $Collator->lt("a\x{AC00}b", "a\x{AE00}b") );
+ok( $Collator->gt("a\x{D7A3}b", "a\x{C544}b") );
+ok( $Collator->lt("a\x{C544}b", "a\x{30A2}b") ); # hangul < hiragana
  
-$UCA->{level} = $old_level;
+$Collator->{level} = $old_level;
  
-$UCA->{katakana_before_hiragana} = 1;
+$Collator->{katakana_before_hiragana} = 1;
  
-ok( $UCA->cmp("abc", "ABC"), -1);
-ok( $UCA->cmp($hiragana, $katakana), 1);
+ok( $Collator->cmp("abc", "ABC"), -1);
+ok( $Collator->ne("abc", "ABC") );
+ok( $Collator->lt("abc", "ABC") );
+ok( $Collator->le("abc", "ABC") );
+ok( $Collator->cmp($hiragana, $katakana), 1);
+ok( $Collator->ne($hiragana, $katakana) );
+ok( $Collator->gt($hiragana, $katakana) );
+ok( $Collator->ge($hiragana, $katakana) );
  
-$UCA->{upper_before_lower} = 1;
+$Collator->{upper_before_lower} = 1;
  
-ok( $UCA->cmp("abc", "ABC"), 1);
-ok( $UCA->cmp($hiragana, $katakana), 1);
+ok( $Collator->cmp("abc", "ABC"), 1);
+ok( $Collator->ge("abc", "ABC"), 1);
+ok( $Collator->gt("abc", "ABC"), 1);
+ok( $Collator->cmp($hiragana, $katakana), 1);
+ok( $Collator->ge($hiragana, $katakana), 1);
+ok( $Collator->gt($hiragana, $katakana), 1);
  
-$UCA->{katakana_before_hiragana} = 0;
+$Collator->{katakana_before_hiragana} = 0;
  
-ok( $UCA->cmp("abc", "ABC"), 1);
-ok( $UCA->cmp($hiragana, $katakana), -1);
+ok( $Collator->cmp("abc", "ABC"), 1);
+ok( $Collator->cmp($hiragana, $katakana), -1);
  
-$UCA->{upper_before_lower} = 0;
+$Collator->{upper_before_lower} = 0;
  
-ok( $UCA->cmp("abc", "ABC"), -1);
-ok( $UCA->cmp($hiragana, $katakana), -1);
+ok( $Collator->cmp("abc", "ABC"), -1);
+ok( $Collator->le("abc", "ABC") );
+ok( $Collator->cmp($hiragana, $katakana), -1);
+ok( $Collator->lt($hiragana, $katakana) );
  
  my $ign = Unicode::Collate->new(
    table => 'keys.txt',
@@ -113,27 +141,60 @@ my $ign = Unicode::Collate->new(
  
  ok( $ign->cmp("element","lament"), 0);
  
-$UCA->{level} = 2;
+$Collator->{level} = 2;
  
-my $orig = "This is a Perl book.";
  my $str;
+
+my $orig = "This is a Perl book.";
  my $sub = "PERL";
  my $rep = "camel";
  my $ret = "This is a camel book.";
  
  $str = $orig;
-if(my @tmp = $UCA->index($str, $sub)){
-  substr($str, $tmp[0], $tmp[1], $rep);
+if(my($pos,$len) = $Collator->index($str, $sub)){
+  substr($str, $pos, $len, $rep);
  }
  
  ok($str, $ret);
  
-$UCA->{level} = $old_level;
+$Collator->{level} = $old_level;
  
  $str = $orig;
-if(my @tmp = $UCA->index($str, $sub)){
-  substr($str, $tmp[0], $tmp[1], $rep);
+if(my($pos,$len) = $Collator->index($str, $sub)){
+  substr($str, $pos, $len, $rep);
  }
  
  ok($str, $orig);
  
+$tr->{level} = 1;
+
+$str = "Ich mu\x{00DF} studieren.";
+$sub = "m\x{00FC}ss";
+my $match = undef;
+if(my($pos, $len) = $tr->index($str, $sub)){
+    $match = substr($str, $pos, $len);
+}
+ok($match, "mu\x{00DF}");
+
+$tr->{level} = $old_level;
+
+$str = "Ich mu\x{00DF} studieren.";
+$sub = "m\x{00FC}ss";
+$match = undef;
+if(my($pos, $len) = $tr->index($str, $sub)){
+    $match = substr($str, $pos, $len);
+}
+ok($match, undef);
+
+$match = undef;
+if(my($pos,$len) = $Collator->index("", "")){
+    $match = substr("", $pos, $len);
+}
+ok($match, "");
+
+$match = undef;
+if(my($pos,$len) = $Collator->index("", "abc")){
+    $match = substr("", $pos, $len);
+}
+ok($match, undef);
+
author	貞廣知行 <bqw10602@nifty.com>
	Mon, 12 Nov 2001 00:22:57 +0000 (09:22 +0900)
committer	Abhijit Menon-Sen <ams@wiw.org>
	Sun, 11 Nov 2001 16:45:00 +0000 (16:45 +0000)
lib/Unicode/Collate.pm		patch \| blob \| history
lib/Unicode/Collate/Changes		patch \| blob \| history
lib/Unicode/Collate/README		patch \| blob \| history
lib/Unicode/Collate/t/test.t		patch \| blob \| history