require Exporter;
-our $VERSION = '0.09';
+our $VERSION = '0.10';
our $PACKAGE = __PACKAGE__;
our @ISA = qw(Exporter);
$self->{rearrange} ||= []; # maybe not U+0000 (an ASCII)
$self->{rearrange} = [ $self->{rearrange} ] if ! ref $self->{rearrange};
- # open the table file
- my $file = defined $self->{table} ? $self->{table} : $KeyFile;
+ # open a table file.
+ # if undef is passed explicitly, no file is read.
+ $self->{table} = $KeyFile unless exists $self->{table};
+ $self->read_table if defined $self->{table};
+
+ if($self->{entry}){
+ $self->parseEntry($_) foreach split /\n/, $self->{entry};
+ }
+
+ # keys of $self->{rearrangeHash} are $self->{rearrange}.
+ $self->{rearrangeHash} = {};
+ @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
+
+ return $self;
+}
+
+
+sub read_table { # read the table file line by line, handing each line to parseEntry
+ my $self = shift; # the collator object whose {table} names the file
+ my $file = $self->{table} ne '' ? $self->{table} : $KeyFile; # an empty table name falls back to $KeyFile
open my $fk, "<$Path/$file" or croak "File does not exist at $Path/$file";
while(<$fk>){
$self->parseEntry($_);
}
close $fk;
- if($self->{entry}){
- $self->parseEntry($_) foreach split /\n/, $self->{entry};
- }
-
- # keys of $self->{rearrangeHash} are $self->{rearrange}.
- $self->{rearrangeHash} = {};
- @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
-
- return $self;
}
+
##
## get $line, parse it, and write an entry in $self
##
my $u = $src[$i];
# non-characters
+ next unless defined $u;
next if $u < 0 || 0x10FFFF < $u # out of range
|| 0xD800 < $u && $u < 0xDFFF; # unpaired surrogates
my $four = $u & 0xFFFF;
# with Combining Char (UTS#10, 4.2.1), here requires Unicode::Normalize.
if($getCombinClass && defined $ch)
{
- for(my $j = $i+1; $j < @src && $getCombinClass->( $src[$j] ); $j++)
+ for(my $j = $i+1; $j < @src; $j++)
{
+ next unless defined $src[$j];
+ last unless $getCombinClass->( $src[$j] );
my $comb = pack 'U', $src[$j];
next if ! $ent->{ $ch.$comb };
$ch .= $comb;
- splice(@src, $j, 1);
+ $src[$j] = undef;
}
}
push @buf, $ch;
-- see 3.1 Linguistic Features; 3.2.1 File Format, UTR #10.
-Overrides a default order or adds a new element
+Overrides a default order or adds a new collation element
entry => <<'ENTRIES', # use the UCA file format
00E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a e>
-- see 4.1 Normalize each input string, UTR #10.
-If specified, strings are normalized before preparation sort keys
+If specified, strings are normalized before preparation of sort keys
(the normalization is executed after preprocess).
As a form name, one of the following names must be used.
By default, the file C<lib/Unicode/Collate/allkeys.txt> is used.
+If undefined explicitly (as C<table =E<gt> undef>),
+no file is read (you should define collation elements using L</entry>).
+
=item undefName
=item undefChar
#########################
use Test;
-BEGIN { plan tests => 50 };
+BEGIN { plan tests => 54 };
use Unicode::Collate;
ok(1); # If we made it this far, we're ok.
if(!$@){
my $NFD = Unicode::Collate->new(
table => 'keys.txt',
+ entry => <<'ENTRIES',
+0430 ; [.0B01.0020.0002.0430] # CYRILLIC SMALL LETTER A
+0410 ; [.0B01.0020.0008.0410] # CYRILLIC CAPITAL LETTER A
+04D3 ; [.0B09.0020.0002.04D3] # CYRILLIC SMALL LETTER A WITH DIAERESIS
+0430 0308 ; [.0B09.0020.0002.04D3] # CYRILLIC SMALL LETTER A WITH DIAERESIS
+04D3 ; [.0B09.0020.0002.04D3] # CYRILLIC SMALL LETTER A WITH DIAERESIS
+0430 0308 ; [.0B09.0020.0002.04D3] # CYRILLIC SMALL LETTER A WITH DIAERESIS
+04D2 ; [.0B09.0020.0008.04D2] # CYRILLIC CAPITAL LETTER A WITH DIAERESIS
+0410 0308 ; [.0B09.0020.0008.04D2] # CYRILLIC CAPITAL LETTER A WITH DIAERESIS
+0430 3099 ; [.0B10.0020.0002.04D3] # A WITH KATAKANA VOICED
+0430 3099 0308 ; [.0B11.0020.0002.04D3] # A WITH KATAKANA VOICED, DIAERESIS
+ENTRIES
);
- ok($NFD->cmp("A$acute", $A_acute), 0);
+ ok($NFD->eq("A$acute", $A_acute));
+ ok($NFD->eq("\x{4D3}\x{325}", "\x{430}\x{308}\x{325}"));
+ ok($NFD->lt("\x{430}\x{308}A", "\x{430}\x{308}B"));
+ ok($NFD->lt("\x{430}\x{3099}B", "\x{430}\x{308}\x{3099}A"));
+ ok($NFD->eq("\x{0430}\x{3099}\x{309A}\x{0308}",
+ "\x{0430}\x{309A}\x{3099}\x{0308}") );
}
else{
ok(1);
+ ok(1);
+ ok(1);
+ ok(1);
+ ok(1);
}
my $tr = Unicode::Collate->new(