Update Unicode-Collate to CPAN version 0.92

author Chris 'BinGOs' Williams <chris@bingosnet.co.uk>

Wed, 14 Nov 2012 14:46:15 +0000 (14:46 +0000)

committer Chris 'BinGOs' Williams <chris@bingosnet.co.uk>

Wed, 14 Nov 2012 14:46:15 +0000 (14:46 +0000)
author Chris 'BinGOs' Williams <chris@bingosnet.co.uk>
Wed, 14 Nov 2012 14:46:15 +0000 (14:46 +0000)
committer Chris 'BinGOs' Williams <chris@bingosnet.co.uk>
Wed, 14 Nov 2012 14:46:15 +0000 (14:46 +0000)
diff --git a/MANIFEST b/MANIFEST

index 41a3a74..722d002 100644 (file)
--- a/MANIFEST
+++ b/MANIFEST
@@ -2814,6 +2814,7 @@ cpan/Unicode-Collate/t/contract.t         Unicode::Collate
  cpan/Unicode-Collate/t/default.t               Unicode::Collate
  cpan/Unicode-Collate/t/hangtype.t              Unicode::Collate
  cpan/Unicode-Collate/t/hangul.t                        Unicode::Collate
+cpan/Unicode-Collate/t/ident.t
  cpan/Unicode-Collate/t/iglevel2.t                      Unicode::Collate
  cpan/Unicode-Collate/t/ignor.t                 Unicode::Collate
  cpan/Unicode-Collate/t/illegalp.t              Unicode::Collate
diff --git a/Porting/Maintainers.pl b/Porting/Maintainers.pl

index c5cffa8..93eaf9d 100755 (executable)
--- a/Porting/Maintainers.pl
+++ b/Porting/Maintainers.pl
@@ -2011,7 +2011,7 @@ use File::Glob qw(:case);
  
      'Unicode::Collate' => {
          'MAINTAINER'   => 'sadahiro',
-        'DISTRIBUTION' => 'SADAHIRO/Unicode-Collate-0.91.tar.gz',
+        'DISTRIBUTION' => 'SADAHIRO/Unicode-Collate-0.92.tar.gz',
          'FILES'        => q[cpan/Unicode-Collate],
          'EXCLUDED'     => [
              qr{N$},
diff --git a/cpan/Unicode-Collate/Changes b/cpan/Unicode-Collate/Changes

index 71e8646..7225c3b 100644 (file)
--- a/cpan/Unicode-Collate/Changes
+++ b/cpan/Unicode-Collate/Changes
@@ -1,5 +1,17 @@
  Revision history for Perl module Unicode::Collate.
  
+0.92  Wed Nov 14 20:58:19 2012
+    - fix: index() etc. with preprocess/normalization should be always croaked.
+    - doc: referred to the latest UTS #10 and updated its section numbers.
+    - supported the identical level (see 'identical' in POD).
+    - Now UCA_Version 26 (for Unicode 6.2.0) is supported.
+    - added ident.t in t.
+    - modified tests: cjkrange.t, compatui.t, hangtype.t, index.t,
+      overcjk0.t, overcjk1.t, test.t, view.t in t.
+
+    * But the default UCA_Version is still 24.
+     (In the next release, UCA_Version 26 will be the default.)
+
  0.91  Sun Nov  4 17:00:20 2012
      - XSUB: use PERL_NO_GET_CONTEXT (see perlguts)
        (see [rt.cpan.org #80313])
@@ -310,7 +322,7 @@ Revision history for Perl module Unicode::Collate.
  
  0.29  Mon Oct 13 12:18:23 2003
      - now UCA Version 11 (but no functionality is different from Version 9).
-    - supported hangul_terminator.
+    - supported 'hangul_terminator'.
      - fix: Base_Unicode_Version falsely returns Perl's Unicode version.
        C4 in UTS #10 requires UTS's Unicode version.
      - For variable weighting, 'variable' is recommended
diff --git a/cpan/Unicode-Collate/Collate.pm b/cpan/Unicode-Collate/Collate.pm

index 5964f83..9e1623c 100644 (file)
--- a/cpan/Unicode-Collate/Collate.pm
+++ b/cpan/Unicode-Collate/Collate.pm
@@ -14,7 +14,7 @@ use File::Spec;
  
  no warnings 'utf8';
  
-our $VERSION = '0.91';
+our $VERSION = '0.92';
  our $PACKAGE = __PACKAGE__;
  
  ### begin XS only ###
@@ -48,16 +48,14 @@ use constant Min3Wt => 0x02;
  use constant Shift4Wt => 0xFFFF;
  
  # A boolean for Variable and 16-bit weights at 4 levels of Collation Element
-# PROBLEM: The Default Unicode Collation Element Table
-# has weights over 0xFFFF at the 4th level.
-# The tie-breaking in the variable weights
-# other than "shift" (as well as "shift-trimmed") is unreliable.
  use constant VCE_TEMPLATE => 'Cn4';
  
  # A sort key: 16-bit weights
-# See also the PROBLEM on VCE_TEMPLATE above.
  use constant KEY_TEMPLATE => 'n*';
  
+# The tie-breaking: 32-bit weights
+use constant TIE_TEMPLATE => 'N*';
+
  # Level separator in a sort key:
  # i.e. pack(KEY_TEMPLATE, 0)
  use constant LEVEL_SEP => "\0\0";
@@ -105,7 +103,7 @@ our @ChangeOK = qw/
      alternate backwards level normalization rearrange
      katakana_before_hiragana upper_before_lower ignore_level2
      overrideHangul overrideCJK preprocess UCA_Version
-    hangul_terminator variable
+    hangul_terminator variable identical
    /;
  
  our @ChangeNG = qw/
@@ -135,18 +133,18 @@ sub change {
      my $self = shift;
      my %hash = @_;
      my %old;
-    if (exists $hash{variable} && exists $hash{alternate}) {
-       delete $hash{alternate};
-    }
-    elsif (!exists $hash{variable} && exists $hash{alternate}) {
-       $hash{variable} = $hash{alternate};
+    if (exists $hash{alternate}) {
+       if (exists $hash{variable}) {
+           delete $hash{alternate};
+       } else {
+           $hash{variable} = $hash{alternate};
+       }
      }
      foreach my $k (keys %hash) {
         if (exists $ChangeOK{$k}) {
             $old{$k} = $self->{$k};
             $self->{$k} = $hash{$k};
-       }
-       elsif (exists $ChangeNG{$k}) {
+       } elsif (exists $ChangeNG{$k}) {
             croak "change of $k via change() is not allowed!";
         }
         # else => ignored
@@ -176,6 +174,7 @@ my %DerivCode = (
     20 => \&_derivCE_20,
     22 => \&_derivCE_22,
     24 => \&_derivCE_24,
+   26 => \&_derivCE_24, # 26 == 24
  );
  
  sub checkCollator {
@@ -193,12 +192,10 @@ sub checkCollator {
  
      if (! defined $self->{backwards}) {
         $self->{backwardsFlag} = 0;
-    }
-    elsif (! ref $self->{backwards}) {
+    } elsif (! ref $self->{backwards}) {
         _checkLevel($self->{backwards}, "backwards");
         $self->{backwardsFlag} = 1 << $self->{backwards};
-    }
-    else {
+    } else {
         my %level;
         $self->{backwardsFlag} = 0;
         for my $b (@{ $self->{backwards} }) {
@@ -443,21 +440,33 @@ sub parseEntry
  sub viewSortKey
  {
      my $self = shift;
-    $self->visualizeSortKey($self->getSortKey(@_));
+    my $str  = shift;
+    $self->visualizeSortKey($self->getSortKey($str));
  }
  
  
+sub process
+{
+    my $self = shift;
+    my $str  = shift;
+    my $prep = $self->{preprocess};
+    my $norm = $self->{normCode};
+
+    $str = &$prep($str) if ref $prep;
+    $str = &$norm($str) if ref $norm;
+    return $str;
+}
+
  ##
  ## arrayref of JCPS   = splitEnt(string to be collated)
-## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, true)
+## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, TRUE)
  ##
  sub splitEnt
  {
      my $self = shift;
-    my $wLen = $_[1];
+    my $str  = shift;
+    my $wLen = shift; # with Length
  
-    my $code = $self->{preprocess};
-    my $norm = $self->{normCode};
      my $map  = $self->{mapping};
      my $max  = $self->{maxlength};
      my $reH  = $self->{rearrangeHash};
@@ -465,20 +474,7 @@ sub splitEnt
      my $ver9 = $vers >= 9 && $vers <= 11;
      my $uXS  = $self->{__useXS}; ### XS only
  
-    my ($str, @buf);
-
-    if ($wLen) {
-       $code and croak "Preprocess breaks character positions. "
-                       . "Don't use with index(), match(), etc.";
-       $norm and croak "Normalization breaks character positions. "
-                       . "Don't use with index(), match(), etc.";
-       $str = $_[0];
-    }
-    else {
-       $str = $_[0];
-       $str = &$code($str) if ref $code;
-       $str = &$norm($str) if ref $norm;
-    }
+    my @buf;
  
      # get array of Unicode code point of string.
      my @src = unpack_U($str);
@@ -696,9 +692,13 @@ sub getWt
  sub getSortKey
  {
      my $self = shift;
-    my $rEnt = $self->splitEnt(shift); # get an arrayref of JCPS
+    my $orig = shift;
+    my $str  = $self->process($orig);
+    my $rEnt = $self->splitEnt($str); # get an arrayref of JCPS
      my $vers = $self->{UCA_Version};
      my $term = $self->{hangul_terminator};
+    my $lev  = $self->{level};
+    my $iden = $self->{identical};
  
      my @buf; # weight arrays
      if ($term) {
@@ -723,7 +723,13 @@ sub getSortKey
         }
      }
  
-    return $self->mk_SortKey(\@buf); ### XS only
+    my $rkey = $self->mk_SortKey(\@buf); ### XS only
+
+    if ($iden || $vers >= 26 && $lev == MaxLevel) {
+       $rkey .= LEVEL_SEP;
+       $rkey .= pack(TIE_TEMPLATE, unpack_U($str)) if $iden;
+    }
+    return $rkey;
  }
  
  
@@ -798,9 +804,15 @@ sub _eqArray($$$)
  sub index
  {
      my $self = shift;
+    $self->{preprocess} and
+       croak "Don't use Preprocess with index(), match(), etc.";
+    $self->{normCode} and
+       croak "Don't use Normalization with index(), match(), etc.";
+
      my $str  = shift;
      my $len  = length($str);
-    my $subE = $self->splitEnt(shift);
+    my $sub  = shift;
+    my $subE = $self->splitEnt($sub);
      my $pos  = @_ ? shift : 0;
         $pos  = 0 if $pos < 0;
      my $glob = shift;
@@ -1034,6 +1046,7 @@ with no parameters, the collator should do the default collation.
        backwards => $levelNumber, # or \@levelNumbers
        entry => $element,
        hangul_terminator => $term_primary_weight,
+      identical => $bool,
        ignoreName => qr/$ignoreName/,
        ignoreChar => qr/$ignoreChar/,
        ignore_level2 => $bool,
@@ -1074,6 +1087,7 @@ The following revisions are supported.  The default is 24.
       20             5.2.0               5.2.0 (5.2.0)
       22             6.0.0               6.0.0 (6.0.0)
       24             6.1.0               6.1.0 (6.1.0)
+     26             6.2.0               6.2.0 (6.2.0)
  
  * Noncharacters (e.g. U+FFFF) are not ignored, and can be overridden
  since C<UCA_Version> 22.
@@ -1099,7 +1113,7 @@ as an alias for C<variable>.
  
  =item backwards
  
--- see 3.1.2 French Accents, UTS #10.
+-- see 3.4 Backward Accents, UTS #10.
  
       backwards => $levelNumber or \@levelNumbers
  
@@ -1109,7 +1123,7 @@ forwards at all the levels.
  
  =item entry
  
--- see 3.1 Linguistic Features; 3.2.1 File Format, UTS #10.
+-- see 5 Tailoring; 3.6.1 File Format, UTS #10.
  
  If the same character (or a sequence of characters) exists
  in the collation element table through C<table>,
@@ -1183,11 +1197,27 @@ automatically terminated with a terminator primary weight.
  These characters may need terminator included in a collation element
  table beforehand.
  
+=item identical
+
+-- see A.3 Deterministic Comparison, UTS #10.
+
+By default, strings whose weights are equal should be equal,
+even though their code points are not equal.
+
+If the parameter is made true, a final, tie-breaking level is used.
+If no difference of weights is found after the comparison through all
+the level (independent of the value of C<level>), the comparison with
+code points will be performed. For the tie-breaking comparision,
+the sort key has code points of the original string appended.
+
+If C<preprocess> and/or C<normalization> is applied, the code points
+of the string after them (in NFD by default) are used.
+
  =item ignoreChar
  
  =item ignoreName
  
--- see 3.2.2 Variable Weighting, UTS #10.
+-- see 3.6.2 Variable Weighting, UTS #10.
  
  Makes the entry in the table completely ignorable;
  i.e. as if the weights were zero at all level.
@@ -1214,7 +1244,7 @@ B<NOTE>: C<level> should be 3 or greater.
  
  =item katakana_before_hiragana
  
--- see 7.3.1 Tertiary Weight Table, UTS #10.
+-- see 7.2 Tertiary Weight Table, UTS #10.
  
  By default, hiragana is before katakana.
  If the parameter is made true, this is reversed.
@@ -1241,6 +1271,13 @@ Any higher levels than the specified one are ignored.
  
  If omitted, the maximum is the 4th.
  
+B<NOTE:> The DUCET includes weights over 0xFFFF at the 4th level.
+But this module only uses weights within 0xFFFF.
+When C<variable> is 'blanked' or 'non-ignorable' (other than 'shifted'
+and 'shift-trimmed'), the level 4 may be unreliable.
+
+See also C<identical>.
+
  =item normalization
  
  -- see 4.1 Normalize, UTS #10.
@@ -1295,7 +1332,7 @@ those in the CJK Unified Ideographs Extension A etc.
      U+4E00..U+9FBB if UCA_Version is 14 or 16.
      U+4E00..U+9FC3 if UCA_Version is 18.
      U+4E00..U+9FCB if UCA_Version is 20 or 22.
-    U+4E00..U+9FCC if UCA_Version is 24.
+    U+4E00..U+9FCC if UCA_Version is 24 or 26.
  
      In the CJK Unified Ideographs Extension blocks:
      Ext.A (U+3400..U+4DB5) and Ext.B (U+20000..U+2A6D6) in any UCA_Version.
@@ -1373,7 +1410,7 @@ in C<table> or C<entry> is still valid.
  
  =item preprocess
  
--- see 5.1 Preprocessing, UTS #10.
+-- see 5.4 Preprocessing, UTS #10.
  
  If specified, the coderef is used to preprocess each string
  before the formation of sort keys.
@@ -1402,7 +1439,7 @@ L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
  
  =item rearrange
  
--- see 3.1.3 Rearrangement, UTS #10.
+-- see 3.5 Rearrangement, UTS #10.
  
  Characters that are not coded in logical order and to be rearranged.
  If C<UCA_Version> is equal to or lesser than 11, default is:
@@ -1458,7 +1495,7 @@ B<NOTE>: Contractions via C<entry> are not be suppressed.
  
  =item table
  
--- see 3.2 Default Unicode Collation Element Table, UTS #10.
+-- see 3.6 Default Unicode Collation Element Table, UTS #10.
  
  You can use another collation element table if desired.
  
@@ -1537,7 +1574,7 @@ this parameter doesn't work validly.
  
  =item variable
  
--- see 3.2.2 Variable Weighting, UTS #10.
+-- see 3.6.2 Variable Weighting, UTS #10.
  
  This key allows for variable weighting of variable collation elements,
  which are marked with an ASTERISK in the table
@@ -1861,6 +1898,11 @@ a collator via C<Unicode::Collate-E<gt>new( )> should be used;
  for F<CollationTest_NON_IGNORABLE.txt>, a collator via
  C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>.
  
+If C<UCA_Version> is 26 or later, the C<identical> level is preferred;
+C<Unicode::Collate-E<gt>new(identical =E<gt> 1)> and
+C<Unicode::Collate-E<gt>new(identical =E<gt> 1,>
+C<variable =E<gt> "non-ignorable", level =E<gt> 3)> should be used.
+
  B<Unicode::Normalize is required to try The Conformance Test.>
  
  =back
diff --git a/cpan/Unicode-Collate/Collate.xs b/cpan/Unicode-Collate/Collate.xs

index 94ff6e4..4d4ecca 100644 (file)
--- a/cpan/Unicode-Collate/Collate.xs
+++ b/cpan/Unicode-Collate/Collate.xs
@@ -617,10 +617,14 @@ varCE (self, vce)
      else if (*a == 's') { /* shifted or shift-trimmed */
         totwt = d[1] + d[2] + d[3] + d[4] + d[5] + d[6];
         if (alen == 7 && totwt != 0) { /* shifted */
-           d[7] = (U8)(Shift4Wt >> 8);
-           d[8] = (U8)(Shift4Wt & 0xFF);
-       }
-       else { /* shift-trimmed */
+           if (d[1] == 0 && d[2] == 1) { /* XXX: CollationAuxiliary-6.2.0 */
+               d[7] = d[1]; /* wt level 1 to 4 */
+               d[8] = d[2];
+           } else {
+               d[7] = (U8)(Shift4Wt >> 8);
+               d[8] = (U8)(Shift4Wt & 0xFF);
+           }
+       } else { /* shift-trimmed */
             d[7] = d[8] = '\0';
         }
      }
@@ -642,7 +646,7 @@ visualizeSortKey (self, key)
      U8 *s, *e, *d;
      STRLEN klen, dlen;
      UV uv;
-    IV uca_vers;
+    IV uca_vers, sep = 0;
      static const char *upperhex = "0123456789ABCDEF";
    CODE:
      if (SvROK(self) && SvTYPE(SvRV(self)) == SVt_PVHV)
@@ -658,10 +662,13 @@ visualizeSortKey (self, key)
      s = (U8*)SvPV(key, klen);
  
     /* slightly *longer* than the need, but I'm afraid of miscounting;
-      exactly: (klen / 2) * 5 + MaxLevel * 2 - 1 (excluding '\0')
-         = (klen / 2) * 5 - 1  # FFFF (16bit) and ' ' between 16bit units
-         + (MaxLevel - 1) * 2  # ' ' and '|' for level boundaries
-         + 2                   # '[' and ']'
+      = (klen / 2) * 5 - 1
+             # FFFF and ' ' for each 16bit units but ' ' is less by 1;
+             # ' ' and '|' for level boundaries including the identical level
+       + 2   # '[' and ']'
+       + 1   # '\0'
+       (a) if klen is odd (not expected), maybe more 5 bytes.
+       (b) there is not always the identical level.
     */
      dlen = (klen / 2) * 5 + MaxLevel * 2 + 2;
      dst = newSV(dlen);
@@ -671,18 +678,18 @@ visualizeSortKey (self, key)
      *d++ = '[';
      for (e = s + klen; s < e; s += 2) {
         uv = (U16)(*s << 8 | s[1]);
-       if (uv) {
+       if (uv || sep >= MaxLevel) {
             if ((d[-1] != '[') && ((9 <= uca_vers) || (d[-1] != '|')))
                 *d++ = ' ';
             *d++ = upperhex[ (s[0] >> 4) & 0xF ];
             *d++ = upperhex[  s[0]       & 0xF ];
             *d++ = upperhex[ (s[1] >> 4) & 0xF ];
             *d++ = upperhex[  s[1]       & 0xF ];
-       }
-       else {
+       } else {
             if ((9 <= uca_vers) && (d[-1] != '['))
                 *d++ = ' ';
             *d++ = '|';
+           ++sep;
         }
      }
      *d++ = ']';
diff --git a/cpan/Unicode-Collate/README b/cpan/Unicode-Collate/README

index 5947585..26e8ff5 100644 (file)
--- a/cpan/Unicode-Collate/README
+++ b/cpan/Unicode-Collate/README
@@ -1,4 +1,4 @@
-Unicode/Collate version 0.91
+Unicode/Collate version 0.92
  ===============================
  
  NAME
diff --git a/cpan/Unicode-Collate/t/cjkrange.t b/cpan/Unicode-Collate/t/cjkrange.t

index 37fb9fd..e3d4f38 100644 (file)
--- a/cpan/Unicode-Collate/t/cjkrange.t
+++ b/cpan/Unicode-Collate/t/cjkrange.t
@@ -13,7 +13,7 @@ BEGIN {
  
  use strict;
  use warnings;
-BEGIN { $| = 1; print "1..379\n"; } # 1 + 42 x @Versions
+BEGIN { $| = 1; print "1..421\n"; } # 1 + 42 x @Versions
  my $count = 0;
  sub ok ($;$) {
      my $p = my $r = shift;
@@ -49,7 +49,7 @@ my $coll = Unicode::Collate->new(
  # 2A700..2B734 are CJK UI Ext.C since UCA_Version 20 (Unicode 5.2).
  # 2B740..2B81D are CJK UI Ext.D since UCA_Version 22 (Unicode 6.0).
  
-my @Versions = (8, 9, 11, 14, 16, 18, 20, 22, 24);
+my @Versions = (8, 9, 11, 14, 16, 18, 20, 22, 24, 26);
  
  for my $v (@Versions) {
      $coll->change(UCA_Version => $v);
diff --git a/cpan/Unicode-Collate/t/compatui.t b/cpan/Unicode-Collate/t/compatui.t

index 6fb01b8..822743e 100644 (file)
--- a/cpan/Unicode-Collate/t/compatui.t
+++ b/cpan/Unicode-Collate/t/compatui.t
@@ -13,7 +13,7 @@ BEGIN {
  
  use strict;
  use warnings;
-BEGIN { $| = 1; print "1..631\n"; } # 1 + 70 x @Versions
+BEGIN { $| = 1; print "1..701\n"; } # 1 + 70 x @Versions
  my $count = 0;
  sub ok ($;$) {
      my $p = my $r = shift;
@@ -30,7 +30,7 @@ ok(1);
  
  #########################
  
-my @Versions = (8, 9, 11, 14, 16, 18, 20, 22, 24);
+my @Versions = (8, 9, 11, 14, 16, 18, 20, 22, 24, 26);
  
  # 12 compatibility ideographs are treated as unified ideographs:
  # FA0E, FA0F, FA11, FA13, FA14, FA1F, FA21, FA23, FA24, FA27, FA28, FA29.
diff --git a/cpan/Unicode-Collate/t/hangtype.t b/cpan/Unicode-Collate/t/hangtype.t

index b85a308..5aa7d49 100644 (file)
--- a/cpan/Unicode-Collate/t/hangtype.t
+++ b/cpan/Unicode-Collate/t/hangtype.t
@@ -13,7 +13,7 @@ BEGIN {
  
  use strict;
  use warnings;
-BEGIN { $| = 1; print "1..451\n"; } # 1 + 50 x @Versions
+BEGIN { $| = 1; print "1..501\n"; } # 1 + 50 x @Versions
  my $count = 0;
  sub ok ($;$) {
      my $p = my $r = shift;
@@ -30,7 +30,7 @@ ok(1);
  
  #########################
  
-my @Versions = (8, 9, 11, 14, 16, 18, 20, 22, 24);
+my @Versions = (8, 9, 11, 14, 16, 18, 20, 22, 24, 26);
  
  for my $v (@Versions) {
      ok(Unicode::Collate::getHST(0x0000, $v), '');
diff --git a/cpan/Unicode-Collate/t/ident.t b/cpan/Unicode-Collate/t/ident.t

new file mode 100644 (file)

index 0000000..4f132d4
--- /dev/null
+++ b/cpan/Unicode-Collate/t/ident.t
@@ -0,0 +1,161 @@
+
+BEGIN {
+    unless ("A" eq pack('U', 0x41)) {
+       print "1..0 # Unicode::Collate " .
+           "cannot stringify a Unicode code point\n";
+       exit 0;
+    }
+    if ($ENV{PERL_CORE}) {
+       chdir('t') if -d 't';
+       @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib);
+    }
+}
+
+use strict;
+use warnings;
+BEGIN { $| = 1; print "1..45\n"; }
+my $count = 0;
+sub ok ($;$) {
+    my $p = my $r = shift;
+    if (@_) {
+       my $x = shift;
+       $p = !defined $x ? !defined $r : !defined $r ? 0 : $r eq $x;
+    }
+    print $p ? "ok" : "not ok", ' ', ++$count, "\n";
+}
+
+use Unicode::Collate;
+
+ok(1);
+
+#########################
+
+my $Collator = Unicode::Collate->new(
+    table => 'keys.txt',
+    normalization => undef,
+);
+
+# [001F] UNIT SEPARATOR
+{
+    ok($Collator->eq("\0",   "\x1F"));
+    ok($Collator->eq("\x1F", "\x{200B}"));
+    ok($Collator->eq("\0",   "\x{200B}"));
+    ok($Collator->eq("\x{313}", "\x{343}"));
+    ok($Collator->eq("\x{2000}", "\x{2001}"));
+    ok($Collator->eq("\x{200B}", "\x{200C}"));
+    ok($Collator->eq("\x{304C}", "\x{304B}\x{3099}"));
+
+    $Collator->change(identical => 1);
+
+    ok($Collator->lt("\0",   "\x1F"));
+    ok($Collator->lt("\x1F", "\x{200B}"));
+    ok($Collator->lt("\0",   "\x{200B}"));
+    ok($Collator->lt("\x{313}", "\x{343}"));
+    ok($Collator->lt("\x{2000}", "\x{2001}"));
+    ok($Collator->lt("\x{200B}", "\x{200C}"));
+    ok($Collator->gt("\x{304C}", "\x{304B}\x{3099}"));
+
+    $Collator->change(identical => 0);
+
+    ok($Collator->eq("\0",   "\x1F"));
+    ok($Collator->eq("\x1F", "\x{200B}"));
+    ok($Collator->eq("\0",   "\x{200B}"));
+    ok($Collator->eq("\x{313}", "\x{343}"));
+    ok($Collator->eq("\x{2000}", "\x{2001}"));
+    ok($Collator->eq("\x{200B}", "\x{200C}"));
+    ok($Collator->eq("\x{304C}", "\x{304B}\x{3099}"));
+}
+
+#### 22
+
+eval { require Unicode::Normalize };
+if (!$@) {
+    $Collator->change(normalization => "NFD");
+
+    $Collator->change(identical => 1);
+
+    ok($Collator->lt("\0", "\x{200B}"));
+    ok($Collator->eq("\x{313}", "\x{343}"));
+    ok($Collator->lt("\x{2000}", "\x{2001}"));
+    ok($Collator->lt("\x{200B}", "\x{200C}"));
+    ok($Collator->eq("\x{304C}", "\x{304B}\x{3099}"));
+
+    $Collator->change(identical => 0);
+
+    ok($Collator->eq("\0", "\x{200B}"));
+    ok($Collator->eq("\x{313}", "\x{343}"));
+    ok($Collator->eq("\x{2000}", "\x{2001}"));
+    ok($Collator->eq("\x{200B}", "\x{200C}"));
+    ok($Collator->eq("\x{304C}", "\x{304B}\x{3099}"));
+} else {
+    ok(1) for 1..10;
+}
+
+$Collator->change(normalization => undef, identical => 1);
+
+##### 32
+
+ok($Collator->viewSortKey("\0"),       '[| | | | 0000 0000]');
+ok($Collator->viewSortKey("\x{200B}"), '[| | | | 0000 200B]');
+
+ok($Collator->viewSortKey('a'),
+    '[0A15 | 0020 | 0002 | FFFF | 0000 0061]');
+
+ok($Collator->viewSortKey("\x{304C}"),
+    '[1926 | 0020 013D | 000E 0002 | FFFF FFFF | 0000 304C]');
+
+ok($Collator->viewSortKey("\x{100000}"),
+    '[FBE0 8000 | 0020 | 0002 | FFFF FFFF | 0010 0000]');
+
+eval { require Unicode::Normalize };
+if (!$@) {
+    $Collator->change(normalization => "NFD");
+
+    ok($Collator->viewSortKey("\x{304C}"),
+    '[1926 | 0020 013D | 000E 0002 | FFFF FFFF | 0000 304B 0000 3099]');
+} else {
+    ok(1);
+}
+
+$Collator->change(normalization => undef);
+
+##### 38
+
+$Collator->change(level => 3);
+
+ok($Collator->viewSortKey("\x{304C}"),
+    '[1926 | 0020 013D | 000E 0002 | | 0000 304C]');
+
+$Collator->change(level => 2);
+
+ok($Collator->viewSortKey("\x{304C}"),
+    '[1926 | 0020 013D | | | 0000 304C]');
+
+$Collator->change(level => 1);
+
+ok($Collator->viewSortKey("\x{304C}"),
+    '[1926 | | | | 0000 304C]');
+
+##### 41
+
+$Collator->change(UCA_Version => 8);
+
+ok($Collator->viewSortKey("\x{304C}"),
+    '[1926||||0000 304C]');
+
+$Collator->change(level => 2);
+
+ok($Collator->viewSortKey("\x{304C}"),
+    '[1926|0020 013D|||0000 304C]');
+
+$Collator->change(level => 3);
+
+ok($Collator->viewSortKey("\x{304C}"),
+    '[1926|0020 013D|000E 0002||0000 304C]');
+
+$Collator->change(level => 4);
+
+ok($Collator->viewSortKey("\x{304C}"),
+    '[1926|0020 013D|000E 0002|FFFF FFFF|0000 304C]');
+
+##### 45
diff --git a/cpan/Unicode-Collate/t/index.t b/cpan/Unicode-Collate/t/index.t

index b3433a9..11cf618 100644 (file)
--- a/cpan/Unicode-Collate/t/index.t
+++ b/cpan/Unicode-Collate/t/index.t
@@ -13,7 +13,7 @@ BEGIN {
  
  use strict;
  use warnings;
-BEGIN { $| = 1; print "1..71\n"; }
+BEGIN { $| = 1; print "1..91\n"; }
  my $count = 0;
  sub ok ($;$) {
      my $p = my $r = shift;
@@ -28,8 +28,6 @@ use Unicode::Collate;
  
  ok(1);
  
-#########################
-
  our $IsEBCDIC = ord("A") != 0x41;
  
  my $Collator = Unicode::Collate->new(
@@ -37,7 +35,7 @@ my $Collator = Unicode::Collate->new(
    normalization => undef,
  );
  
-##############
+##### 1
  
  my %old_level = $Collator->change(level => 2);
  
@@ -64,7 +62,7 @@ if (my($pos,$len) = $Collator->index($str, $sub)) {
  
  ok($str, $orig);
  
-##############
+##### 3
  
  my $match;
  
@@ -126,7 +124,7 @@ if (my($pos, $len) = $Collator->index($str, $sub)) {
  }
  ok($match, $ret);
  
-##############
+##### 9
  
  $Collator->change(level => 1);
  
@@ -165,7 +163,7 @@ if (my($pos,$len) = $Collator->index("", "abc")) {
  }
  ok($match, undef);
  
-##############
+##### 13
  
  $Collator->change(level => 1);
  
@@ -201,7 +199,7 @@ if (my($pos, $len) = $Collator->index($str, $sub)) {
  }
  ok($match, $ret);
  
-##############
+##### 16
  
  $Collator->change(level => 1);
  
@@ -246,7 +244,7 @@ ok($match, undef);
  
  $Collator->change(%old_level);
  
-##############
+##### 22
  
  my @ret;
  
@@ -318,7 +316,7 @@ ok($ret, undef);
  
  $Collator->change(%old_level);
  
-##############
+##### 38
  
  $Collator->change(level => 1);
  
@@ -349,6 +347,8 @@ $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
  ok($str, "<b>Camel</b> donkey zebra <b>came\x{301}l</b> "
         . "<b>CAMEL</b> horse <b>cAm\0E\0L</b>...");
  
+##### 47
+
  # http://www.xray.mpe.mpg.de/mailing-lists/perl-unicode/2010-09/msg00014.html
  # when the substring includes an ignorable element like a space...
  
@@ -376,6 +376,8 @@ $str = "Camel donkey zebra camex{301}l CAMEL horse cAmEL-horse...";
  $Collator->gsubst($str, "ca\x{300}melho\x{302}rse", sub { "=$_[0]=" });
  ok($str, "Camel donkey zebra camex{301}l =CAMEL horse= =cAmEL-horse=...");
  
+##### 53
+
  $Collator->change(level => 3);
  
  $str = "P\cBe\x{300}\cBrl and PERL.";
@@ -400,7 +402,7 @@ ok($str, "P\cBe\x{300}\cBrl and PERL.");
  
  $Collator->change(%old_level);
  
-##############
+##### 61
  
  $str = "Perl and Camel";
  $ret = $Collator->gsubst($str, "\cA\cA\0", "AB");
@@ -422,7 +424,7 @@ $ret = $Collator->gsubst($str, 'PP', "ABC");
  ok($ret, 2);
  ok($str, "ABCABCP");
  
-##############
+##### 69
  
  # Shifted; ignorable after variable
  
@@ -434,3 +436,99 @@ $Collator->change(alternate => 'Non-ignorable');
  ($ret) = $Collator->match("A?\x{300}!\x{301}B\x{315}", "?!");
  ok($ret, undef);
  
+##### 71
+
+# Now preprocess is defined.
+
+$Collator->change(preprocess => sub {''});
+
+eval { $Collator->index("", "") };
+ok($@ && $@ =~ /Don't use Preprocess with index\(\)/);
+
+eval { $Collator->index("a", "a") };
+ok($@ && $@ =~ /Don't use Preprocess with index\(\)/);
+
+eval { $Collator->match("", "") };
+ok($@ && $@ =~ /Don't use Preprocess with.*match\(\)/);
+
+eval { $Collator->match("a", "a") };
+ok($@ && $@ =~ /Don't use Preprocess with.*match\(\)/);
+
+$Collator->change(preprocess => sub { uc shift });
+
+eval { $Collator->index("", "") };
+ok($@ && $@ =~ /Don't use Preprocess with index\(\)/);
+
+eval { $Collator->index("a", "a") };
+ok($@ && $@ =~ /Don't use Preprocess with index\(\)/);
+
+eval { $Collator->match("", "") };
+ok($@ && $@ =~ /Don't use Preprocess with.*match\(\)/);
+
+eval { $Collator->match("a", "a") };
+ok($@ && $@ =~ /Don't use Preprocess with.*match\(\)/);
+
+##### 79
+
+eval { require Unicode::Normalize };
+my $has_norm = !$@;
+
+if ($has_norm) {
+    # Now preprocess and normalization are defined.
+
+    $Collator->change(normalization => 'NFD');
+
+    eval { $Collator->index("", "") };
+    ok($@ && $@ =~ /Don't use Preprocess with index\(\)/);
+
+    eval { $Collator->index("a", "a") };
+    ok($@ && $@ =~ /Don't use Preprocess with index\(\)/);
+
+    eval { $Collator->match("", "") };
+    ok($@ && $@ =~ /Don't use Preprocess with.*match\(\)/);
+
+    eval { $Collator->match("a", "a") };
+    ok($@ && $@ =~ /Don't use Preprocess with.*match\(\)/);
+} else {
+    ok(1) for 1..4;
+}
+
+$Collator->change(preprocess => undef);
+
+if ($has_norm) {
+    # Now only normalization is defined.
+
+    eval { $Collator->index("", "") };
+    ok($@ && $@ =~ /Don't use Normalization with index\(\)/);
+
+    eval { $Collator->index("a", "a") };
+    ok($@ && $@ =~ /Don't use Normalization with index\(\)/);
+
+    eval { $Collator->match("", "") };
+    ok($@ && $@ =~ /Don't use Normalization with.*match\(\)/);
+
+    eval { $Collator->match("a", "a") };
+    ok($@ && $@ =~ /Don't use Normalization with.*match\(\)/);
+
+    $Collator->change(normalization => undef);
+} else {
+    ok(1) for 1..4;
+}
+
+##### 87
+
+# Now preprocess and normalization are undef.
+
+eval { $Collator->index("", "") };
+ok(!$@);
+
+eval { $Collator->index("a", "a") };
+ok(!$@);
+
+eval { $Collator->match("", "") };
+ok(!$@);
+
+eval { $Collator->match("a", "a") };
+ok(!$@);
+
+##### 91
diff --git a/cpan/Unicode-Collate/t/overcjk0.t b/cpan/Unicode-Collate/t/overcjk0.t

index 588e8a8..081f57b 100644 (file)
--- a/cpan/Unicode-Collate/t/overcjk0.t
+++ b/cpan/Unicode-Collate/t/overcjk0.t
@@ -13,7 +13,7 @@ BEGIN {
  
  use strict;
  use warnings;
-BEGIN { $| = 1; print "1..285\n"; } # 6 + 31 x @Versions
+BEGIN { $| = 1; print "1..316\n"; } # 6 + 31 x @Versions
  my $count = 0;
  sub ok ($;$) {
      my $p = my $r = shift;
@@ -62,7 +62,7 @@ ok($ignoreCJK->lt("Pe\x{5B57}rl", "Perl")); # 'r' is unassigned.
  # 2A700..2B734 are CJK UI Ext.C since UCA_Version 20 (Unicode 5.2).
  # 2B740..2B81D are CJK UI Ext.D since UCA_Version 22 (Unicode 6.0).
  
-my @Versions = (8, 9, 11, 14, 16, 18, 20, 22, 24);
+my @Versions = (8, 9, 11, 14, 16, 18, 20, 22, 24, 26);
  
  for my $v (@Versions) {
      $ignoreCJK->change(UCA_Version => $v);
diff --git a/cpan/Unicode-Collate/t/overcjk1.t b/cpan/Unicode-Collate/t/overcjk1.t

index dc3ae8f..7bee176 100644 (file)
--- a/cpan/Unicode-Collate/t/overcjk1.t
+++ b/cpan/Unicode-Collate/t/overcjk1.t
@@ -13,7 +13,7 @@ BEGIN {
  
  use strict;
  use warnings;
-BEGIN { $| = 1; print "1..155\n"; } # 11 + 16 x @Versions
+BEGIN { $| = 1; print "1..171\n"; } # 11 + 16 x @Versions
  my $count = 0;
  sub ok ($;$) {
      my $p = my $r = shift;
@@ -62,7 +62,7 @@ ok($overCJK->lt("a\x{4E03}", "A\x{4E01}"));
  # 9FC4..9FCB are CJK UI since UCA_Version 20 (Unicode 5.2).
  # 9FCC       is  CJK UI since UCA_Version 24 (Unicode 6.1).
  
-my @Versions = (8, 9, 11, 14, 16, 18, 20, 22, 24);
+my @Versions = (8, 9, 11, 14, 16, 18, 20, 22, 24, 26);
  
  for my $v (@Versions) {
      $overCJK->change(UCA_Version => $v);
diff --git a/cpan/Unicode-Collate/t/test.t b/cpan/Unicode-Collate/t/test.t

index 440c3a9..552440f 100644 (file)
--- a/cpan/Unicode-Collate/t/test.t
+++ b/cpan/Unicode-Collate/t/test.t
@@ -13,7 +13,7 @@ BEGIN {
  
  use strict;
  use warnings;
-BEGIN { $| = 1; print "1..107\n"; }
+BEGIN { $| = 1; print "1..112\n"; }
  my $count = 0;
  sub ok ($;$) {
      my $p = my $r = shift;
@@ -385,5 +385,36 @@ $_ = 'Foo';
  @temp = $c->index("perl5", "LR");
  ok($_, 'Foo');
  
-#####
+##### 108..109
+
+{
+    my $caseless = Unicode::Collate->new(
+       table => "keys.txt",
+       normalization => undef,
+       preprocess => sub { uc shift },
+    );
+    ok( $Collator->gt("ABC","abc") );
+    ok( $caseless->eq("ABC","abc") );
+}
  
+##### 110..112
+
+{
+    eval { require Unicode::Normalize; };
+    if ($@) {
+       eval { my $n1 = Unicode::Collate->new(table => "keys.txt"); };
+        ok($@ =~ /Unicode::Normalize is required/);
+
+       eval { my $n2 = Unicode::Collate->new
+               (table => "keys.txt", normalization => undef); };
+       ok(!$@);
+
+       eval { my $n3 = Unicode::Collate->new
+               (table => "keys.txt", normalization => 'prenormalized'); };
+        ok($@ =~ /Unicode::Normalize is required/);
+    } else {
+       ok(1) for 1..3;
+    }
+}
+
+#####
diff --git a/cpan/Unicode-Collate/t/view.t b/cpan/Unicode-Collate/t/view.t

index 6f7c0fb..4759533 100644 (file)
--- a/cpan/Unicode-Collate/t/view.t
+++ b/cpan/Unicode-Collate/t/view.t
@@ -13,7 +13,7 @@ BEGIN {
  
  use strict;
  use warnings;
-BEGIN { $| = 1; print "1..53\n"; }
+BEGIN { $| = 1; print "1..89\n"; }
  my $count = 0;
  sub ok ($;$) {
      my $p = my $r = shift;
@@ -28,15 +28,14 @@ use Unicode::Collate;
  
  ok(1);
  
-#########################
+##### 1
  
  my $Collator = Unicode::Collate->new(
    table => 'keys.txt',
    normalization => undef,
+  UCA_Version => 24,
  );
  
-##############
-
  ok($Collator->viewSortKey(""), "[| | |]");
  
  ok($Collator->viewSortKey("A"), "[0A15 | 0020 | 0008 | FFFF]");
@@ -60,7 +59,7 @@ ok($Collator->viewSortKey("A"), "[0A15 | 0020 | |]");
  $Collator->change(level => 1);
  ok($Collator->viewSortKey("A"), "[0A15 | | |]");
  
-### Version 8
+##### 10
  
  $Collator->change(level => 4, UCA_Version => 8);
  
@@ -87,7 +86,7 @@ ok($Collator->viewSortKey("A"), "[0A15|0020||]");
  $Collator->change(level => 1);
  ok($Collator->viewSortKey("A"), "[0A15|||]");
  
-# Version 9
+##### 19
  
  $Collator->change(level => 3, UCA_Version => 9);
  ok($Collator->viewSortKey("A\x{300}z\x{301}"),
@@ -156,7 +155,7 @@ ok($Collator->viewSortKey("?!."), '[| | | 024E 024B 0255]');
  
  $Collator->change(%origVar);
  
-#####
+##### 37
  
  # Level 3 weight
  
@@ -197,7 +196,7 @@ ok($Collator->viewSortKey("a\x{3042}"),
  ok($Collator->viewSortKey("A\x{30A2}"),
      '[0A15 1921 | 0020 0020 | 0008 0011 | FFFF FFFF]');
  
-#####
+##### 47
  
  our $el = Unicode::Collate->new(
    entry => <<'ENTRY',
@@ -214,6 +213,7 @@ FF2C ; [.0B03.0020.0009.FF2C] # FULLWIDTH LATIN CAPITAL LETTER L; QQK
  ENTRY
    table => undef,
    normalization => undef,
+  UCA_Version => 24,
  );
  
  our $el12 = '0B03 0B03 0B03 0B03 0B03 | 0020 0020 0020 0020 0020';
@@ -240,5 +240,30 @@ ok($el->viewSortKey("l\x{FF4C}\x{217C}\x{2113}\x{24DB}"),
  ok($el->viewSortKey("L\x{FF2C}\x{216C}\x{2112}\x{24C1}"),
      "[$el12 | 0008 0009 000A 000B 000C | FFFF FFFF FFFF FFFF FFFF]");
  
-#####
+##### 53
+
+my @Versions = (9, 11, 14, 16, 18, 20, 22, 24, 26);
+
+for my $v (@Versions) {
+    $Collator->change(UCA_Version => $v);
+    my $app = $v >= 26 ? ' |]' : ']';
+
+    $Collator->change(variable => 'Shifted', level => 4);
+    ok($Collator->viewSortKey("1+2"),
+       '[0A0C 0A0D | 0020 0020 | 0002 0002 | FFFF 039F FFFF'.$app);
+
+    $Collator->change(variable => 'Shift-Trimmed');
+    ok($Collator->viewSortKey("1+2"),
+       '[0A0C 0A0D | 0020 0020 | 0002 0002 | 039F'.$app);
+
+    $Collator->change(variable => 'Non-ignorable', level => 3);
+    ok($Collator->viewSortKey("1+2"),
+       '[0A0C 039F 0A0D | 0020 0020 0020 | 0002 0002 0002 |]');
+
+    $Collator->change(variable => 'Blanked');
+    ok($Collator->viewSortKey("1+2"),
+       '[0A0C 0A0D | 0020 0020 | 0002 0002 |]');
+}
+
+##### 89
author	Chris 'BinGOs' Williams <chris@bingosnet.co.uk>
	Wed, 14 Nov 2012 14:46:15 +0000 (14:46 +0000)
committer	Chris 'BinGOs' Williams <chris@bingosnet.co.uk>
	Wed, 14 Nov 2012 14:46:15 +0000 (14:46 +0000)
MANIFEST		patch \| blob \| history
Porting/Maintainers.pl		patch \| blob \| history
cpan/Unicode-Collate/Changes		patch \| blob \| history
cpan/Unicode-Collate/Collate.pm		patch \| blob \| history
cpan/Unicode-Collate/Collate.xs		patch \| blob \| history
cpan/Unicode-Collate/README		patch \| blob \| history
cpan/Unicode-Collate/t/cjkrange.t		patch \| blob \| history
cpan/Unicode-Collate/t/compatui.t		patch \| blob \| history
cpan/Unicode-Collate/t/hangtype.t		patch \| blob \| history
cpan/Unicode-Collate/t/ident.t	[new file with mode: 0644]	patch \| blob
cpan/Unicode-Collate/t/index.t		patch \| blob \| history
cpan/Unicode-Collate/t/overcjk0.t		patch \| blob \| history
cpan/Unicode-Collate/t/overcjk1.t		patch \| blob \| history
cpan/Unicode-Collate/t/test.t		patch \| blob \| history
cpan/Unicode-Collate/t/view.t		patch \| blob \| history