From f58b9ef133ec1792309e75435d4b73428cef3ea2 Mon Sep 17 00:00:00 2001 From: Chris 'BinGOs' Williams Date: Sun, 23 Jan 2011 16:48:32 +0000 Subject: [PATCH] Update Unicode-Collate to CPAN version 0.72 Second attempt to integrate the XS version of Unicode::Collate into core. [DELTA] 0.72 Sat Jan 22 17:28:32 2011 - xs: fix mixing char* and U8*. 0.71 Tue Jan 18 22:29:44 2011 - t/loc_test.t should not fail without Unicode::Normalize. 0.70 Sun Jan 16 20:31:07 2011 - Now U::C::Locale->new will use the compiled DUCET via XS if available. added some tests in t/loc_test.t. 0.69 Sat Jan 15 19:41:11 2011 - clarified about XSUB. revised INSTALL in README. - xs: flag passed to utf8n_to_uvuni(). - doc and comments: [perl #81876] Fix typos by Peter J. Acklam. --- MANIFEST | 3 + Porting/Maintainers.pl | 2 +- cpan/Unicode-Collate/.gitignore | 1 + cpan/Unicode-Collate/Changes | 25 +- cpan/Unicode-Collate/Collate.pm | 360 +++-------------- cpan/Unicode-Collate/Collate.xs | 691 +++++++++++++++++++++++++++++++++ cpan/Unicode-Collate/Collate/Locale.pm | 8 +- cpan/Unicode-Collate/Makefile.PL | 28 ++ cpan/Unicode-Collate/README | 17 +- cpan/Unicode-Collate/mkheader | 196 ++++++++++ cpan/Unicode-Collate/t/loc_test.t | 13 +- pod/perldelta.pod | 7 + 12 files changed, 1020 insertions(+), 331 deletions(-) create mode 100644 cpan/Unicode-Collate/.gitignore create mode 100644 cpan/Unicode-Collate/Collate.xs create mode 100644 cpan/Unicode-Collate/Makefile.PL create mode 100644 cpan/Unicode-Collate/mkheader diff --git a/MANIFEST b/MANIFEST index 05ec676..9842238 100644 --- a/MANIFEST +++ b/MANIFEST @@ -2554,6 +2554,9 @@ cpan/Unicode-Collate/Collate/Locale/zh_pin.pl Unicode::Collate cpan/Unicode-Collate/Collate/Locale/zh.pl Unicode::Collate cpan/Unicode-Collate/Collate/Locale/zh_strk.pl Unicode::Collate cpan/Unicode-Collate/Collate.pm Unicode::Collate +cpan/Unicode-Collate/Collate.xs Unicode::Collate +cpan/Unicode-Collate/Makefile.PL Unicode::Collate +cpan/Unicode-Collate/mkheader Unicode::Collate cpan/Unicode-Collate/README Unicode::Collate cpan/Unicode-Collate/t/altern.t Unicode::Collate cpan/Unicode-Collate/t/backwds.t Unicode::Collate diff --git a/Porting/Maintainers.pl b/Porting/Maintainers.pl index ad040d5..f41a267 100755 --- a/Porting/Maintainers.pl +++ b/Porting/Maintainers.pl @@ -1577,7 +1577,7 @@ use File::Glob qw(:case); 'Unicode::Collate' => { 'MAINTAINER' => 'sadahiro', - 'DISTRIBUTION' => 'SADAHIRO/Unicode-Collate-0.70-withoutworldwriteables.tar.gz', + 'DISTRIBUTION' => 'SADAHIRO/Unicode-Collate-0.72-withoutworldwriteables.tar.gz', 'FILES' => q[cpan/Unicode-Collate], 'EXCLUDED' => [ qr{N$}, qr{^data/}, diff --git a/cpan/Unicode-Collate/.gitignore b/cpan/Unicode-Collate/.gitignore new file mode 100644 index 0000000..424c745 --- /dev/null +++ b/cpan/Unicode-Collate/.gitignore @@ -0,0 +1 @@ +*.h diff --git a/cpan/Unicode-Collate/Changes b/cpan/Unicode-Collate/Changes index ca9be54..816d43a 100644 --- a/cpan/Unicode-Collate/Changes +++ b/cpan/Unicode-Collate/Changes @@ -1,5 +1,20 @@ Revision history for Perl module Unicode::Collate. +0.72 Sat Jan 22 17:28:32 2011 + - xs: fix mixing char* and U8*. + +0.71 Tue Jan 18 22:29:44 2011 + - t/loc_test.t should not fail without Unicode::Normalize. + +0.70 Sun Jan 16 20:31:07 2011 + - Now U::C::Locale->new will use the compiled DUCET via XS if available. + added some tests in t/loc_test.t. + +0.69 Sat Jan 15 19:41:11 2011 + - clarified about XSUB. revised INSTALL in README. + - xs: flag passed to utf8n_to_uvuni(). + - doc and comments: [perl #81876] Fix typos by Peter J. Acklam. + 0.68 Tue Nov 23 20:17:22 2010 - doc: clarified about (backwards => [ ]) and (backwards => undef). - separated t/backwds.t from t/test.t. @@ -24,7 +39,7 @@ Revision history for Perl module Unicode::Collate. - 12 compat. ideographs (e.g. U+FA0E) are treated as unified ideographs. (though DUCET also does it, now Unicode::Collate does it without DUCET.) - added t/compatui.t. - ! Ideographs Ext.B (U+20000..U+2A6D6) can be overrided with UCA_Version 8. + ! Ideographs Ext.B (U+20000..U+2A6D6) can be overridden with UCA_Version 8. This is a long-standing behavior from Unicode::Collate 0.11 to 0.63. A wrong fix at 0.64 should be abandoned. @@ -121,6 +136,8 @@ Revision history for Perl module Unicode::Collate. - U+9FC4..U+9FCB and U+2A700..U+2B734 are new CJK unified ideographs. - Many hangul jamo are assigned (affecting hangul_terminator). + ! Now XSUB will be built by default. (XSUB needs a C compiler.) + To build pure perl, run disableXS before Makefile.PL. ! DUCET will be compiled when XS is used. Explicit saying 'allkeys.txt'> (or using another table) will prevent this module from using the compiled DUCET. @@ -174,11 +191,11 @@ Revision history for Perl module Unicode::Collate. (Perl 5.7.3 or before)). If perl 5.6.X is used, XSUB may help it in place of broken CORE::unpack('U*') in older perl. - added illegal.t and illegalp.t in t. - - added XSUB (EXPERIMENTAL!) where some functions are implemented - in XSUB. Pure Perl is also supported. + - added XSUB where some functions are implemented in XSUB. + Pure Perl is also supported. 0.30 Mon Oct 13 21:26:37 2003 - - fix: Completely ignorable in table should be able to be overrided + - fix: Completely ignorable in table should be able to be overridden by non-ignorable in entry. - fix: Maximum length for contraction must not be shortened by a shorter contraction following in table and/or entry. diff --git a/cpan/Unicode-Collate/Collate.pm b/cpan/Unicode-Collate/Collate.pm index b337b6f..c3ed1c7 100644 --- a/cpan/Unicode-Collate/Collate.pm +++ b/cpan/Unicode-Collate/Collate.pm @@ -14,9 +14,13 @@ use File::Spec; no warnings 'utf8'; -our $VERSION = '0.6801'; +our $VERSION = '0.72'; our $PACKAGE = __PACKAGE__; +require DynaLoader; +our @ISA = qw(DynaLoader); +bootstrap Unicode::Collate $VERSION; + my @Path = qw(Unicode Collate); my $KeyFile = "allkeys.txt"; @@ -71,49 +75,8 @@ use constant NON_VAR => 0; # Non-Variable character use constant VAR => 1; # Variable character # specific code points -use constant Hangul_SBase => 0xAC00; use constant Hangul_SIni => 0xAC00; use constant Hangul_SFin => 0xD7A3; -use constant Hangul_NCount => 588; -use constant Hangul_TCount => 28; -use constant Hangul_LBase => 0x1100; -use constant Hangul_LIni => 0x1100; -use constant Hangul_LFin => 0x1159; -use constant Hangul_LFill => 0x115F; -use constant Hangul_LEnd => 0x115F; # Unicode 5.2 -use constant Hangul_VBase => 0x1161; -use constant Hangul_VIni => 0x1160; # from Vowel Filler -use constant Hangul_VFin => 0x11A2; -use constant Hangul_VEnd => 0x11A7; # Unicode 5.2 -use constant Hangul_TBase => 0x11A7; # from "no-final" codepoint -use constant Hangul_TIni => 0x11A8; -use constant Hangul_TFin => 0x11F9; -use constant Hangul_TEnd => 0x11FF; # Unicode 5.2 -use constant HangulL2Ini => 0xA960; # Unicode 5.2 -use constant HangulL2Fin => 0xA97C; # Unicode 5.2 -use constant HangulV2Ini => 0xD7B0; # Unicode 5.2 -use constant HangulV2Fin => 0xD7C6; # Unicode 5.2 -use constant HangulT2Ini => 0xD7CB; # Unicode 5.2 -use constant HangulT2Fin => 0xD7FB; # Unicode 5.2 - -use constant CJK_UidIni => 0x4E00; -use constant CJK_UidFin => 0x9FA5; -use constant CJK_UidF41 => 0x9FBB; -use constant CJK_UidF51 => 0x9FC3; -use constant CJK_UidF52 => 0x9FCB; -use constant CJK_ExtAIni => 0x3400; # Unicode 3.0 -use constant CJK_ExtAFin => 0x4DB5; # Unicode 3.0 -use constant CJK_ExtBIni => 0x20000; # Unicode 3.1 -use constant CJK_ExtBFin => 0x2A6D6; # Unicode 3.1 -use constant CJK_ExtCIni => 0x2A700; # Unicode 5.2 -use constant CJK_ExtCFin => 0x2B734; # Unicode 5.2 -use constant CJK_ExtDIni => 0x2B740; # Unicode 6.0 -use constant CJK_ExtDFin => 0x2B81D; # Unicode 6.0 - -my %CompatUI = map +($_ => 1), ( - 0xFA0E, 0xFA0F, 0xFA11, 0xFA13, 0xFA14, 0xFA1F, - 0xFA21, 0xFA23, 0xFA24, 0xFA27, 0xFA28, 0xFA29, -); # Logical_Order_Exception in PropList.txt my $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ]; @@ -128,10 +91,6 @@ sub pack_U { return pack('U*', @_); } -sub unpack_U { - return unpack('U*', shift(@_).pack('U*')); -} - ###### my (%VariableOK); @@ -152,6 +111,7 @@ our @ChangeNG = qw/ versionTable alternateTable backwardsTable forwardsTable rearrangeTable derivCode normCode rearrangeHash backwardsFlag suppress suppressHash + __useXS /; # The hash key 'ignored' is deleted at v 0.21. # The hash key 'isShift' is deleted at v 0.23. @@ -285,6 +245,12 @@ sub new my $class = shift; my $self = bless { @_ }, $class; + if (! exists $self->{table} && + !defined $self->{undefName} && !defined $self->{ignoreName} && + !defined $self->{undefChar} && !defined $self->{ignoreChar}) { + $self->{__useXS} = \&_fetch_simple; + } # XS only + # keys of $self->{suppressHash} are $self->{suppress}. if ($self->{suppress} && @{ $self->{suppress} }) { @{ $self->{suppressHash} }{ @{ $self->{suppress} } } = (); @@ -347,6 +313,20 @@ sub parseAtmark { sub read_table { my $self = shift; + if ($self->{__useXS}) { + my @rest = _fetch_rest(); # complex matter need to parse + for my $line (@rest) { + next if $line =~ /^\s*#/; + + if ($line =~ s/^\s*\@//) { + $self->parseAtmark($line); + } else { + $self->parseEntry($line); + } + } + return; + } + my($f, $fh); foreach my $d (@INC) { $f = File::Spec->catfile($d, @Path, $self->{table}); @@ -445,50 +425,12 @@ sub parseEntry } -## -## VCE = _varCE(variable, VCE) -## -sub _varCE -{ - my $vbl = shift; - my $vce = shift; - if ($vbl eq 'non-ignorable') { - return $vce; - } - my ($var, @wt) = unpack VCE_TEMPLATE, $vce; - - if ($var) { - return pack(VCE_TEMPLATE, $var, 0, 0, 0, - $vbl eq 'blanked' ? $wt[3] : $wt[0]); - } - elsif ($vbl eq 'blanked') { - return $vce; - } - else { - return pack(VCE_TEMPLATE, $var, @wt[0..2], - $vbl eq 'shifted' && $wt[0]+$wt[1]+$wt[2] ? Shift4Wt : 0); - } -} - sub viewSortKey { my $self = shift; $self->visualizeSortKey($self->getSortKey(@_)); } -sub visualizeSortKey -{ - my $self = shift; - my $view = join " ", map sprintf("%04X", $_), unpack(KEY_TEMPLATE, shift); - - if ($self->{UCA_Version} <= 8) { - $view =~ s/ ?0000 ?/|/g; - } else { - $view =~ s/\b0000\b/|/g; - } - return "[$view]"; -} - ## ## arrayref of JCPS = splitEnt(string to be collated) @@ -506,6 +448,7 @@ sub splitEnt my $reH = $self->{rearrangeHash}; my $vers = $self->{UCA_Version}; my $ver9 = $vers >= 9 && $vers <= 11; + my $uXS = $self->{__useXS}; my ($str, @buf); @@ -544,6 +487,9 @@ sub splitEnt } elsif ($ver9) { $src[$i] = undef if $map->{ $src[$i] } && @{ $map->{ $src[$i] } } == 0; + if ($uXS) { + $src[$i] = undef if _ignorable_simple($src[$i]); + } } } @@ -623,7 +569,8 @@ sub splitEnt } # skip completely ignorable - if ($map->{$jcps} && @{ $map->{$jcps} } == 0) { + if ($uXS && $jcps =~ /^[0-9]+\z/ && _ignorable_simple($jcps) || + $map->{$jcps} && @{ $map->{$jcps} } == 0) { if ($wLen && @buf) { $buf[-1][2] = $i + 1; } @@ -662,10 +609,13 @@ sub getWt my $vbl = $self->{variable}; my $map = $self->{mapping}; my $der = $self->{derivCode}; + my $uXS = $self->{__useXS}; return if !defined $u; return map(_varCE($vbl, $_), @{ $map->{$u} }) if $map->{$u}; + return map(_varCE($vbl, $_), _fetch_simple($u)) + if $uXS && _exists_simple($u); # JCPS must not be a contraction, then it's a code point. if (Hangul_SIni <= $u && $u <= Hangul_SFin) { @@ -692,7 +642,7 @@ sub getWt $map->{$contract} and @decH = ($contract, $decH[2]); } # even if V's ignorable, LT contraction is not supported. - # If such a situatution were required, NFD should be used. + # If such a situation were required, NFD should be used. } if (@decH == 3 && $max->{$decH[1]}) { my $contract = join(CODE_SEP, @decH[1,2]); @@ -701,7 +651,9 @@ sub getWt } @hangulCE = map({ - $map->{$_} ? @{ $map->{$_} } : $der->($_); + $map->{$_} ? @{ $map->{$_} } : + $uXS && _exists_simple($_) ? _fetch_simple($_) : + $der->($_); } @decH); } return map _varCE($vbl, $_), @hangulCE; @@ -726,12 +678,10 @@ sub getWt sub getSortKey { my $self = shift; - my $lev = $self->{level}; my $rEnt = $self->splitEnt(shift); # get an arrayref of JCPS my $vers = $self->{UCA_Version}; my $vbl = $self->{variable}; my $term = $self->{hangul_terminator}; - my $v2i = $vers >= 9 && $vbl ne 'non-ignorable'; my @buf; # weight arrays if ($term) { @@ -756,53 +706,7 @@ sub getSortKey } } - # make sort key - my @ret = ([],[],[],[]); - my $last_is_variable; - - foreach my $vwt (@buf) { - my($var, @wt) = unpack(VCE_TEMPLATE, $vwt); - - # "Ignorable (L1, L2) after Variable" since track. v. 9 - if ($v2i) { - if ($var) { - $last_is_variable = TRUE; - } elsif (!$wt[0]) { # ignorable - next if $last_is_variable; - } else { - $last_is_variable = FALSE; - } - } - foreach my $v (0..$lev-1) { - 0 < $wt[$v] and push @{ $ret[$v] }, $wt[$v]; - } - } - - # modification of tertiary weights - if ($self->{upper_before_lower}) { - foreach my $w (@{ $ret[2] }) { - if (0x8 <= $w && $w <= 0xC) { $w -= 6 } # lower - elsif (0x2 <= $w && $w <= 0x6) { $w += 6 } # upper - elsif ($w == 0x1C) { $w += 1 } # square upper - elsif ($w == 0x1D) { $w -= 1 } # square lower - } - } - if ($self->{katakana_before_hiragana}) { - foreach my $w (@{ $ret[2] }) { - if (0x0F <= $w && $w <= 0x13) { $w -= 2 } # katakana - elsif (0x0D <= $w && $w <= 0x0E) { $w += 5 } # hiragana - } - } - - if ($self->{backwardsFlag}) { - for (my $v = MinLevel; $v <= MaxLevel; $v++) { - if ($self->{backwardsFlag} & (1 << $v)) { - @{ $ret[$v-1] } = reverse @{ $ret[$v-1] }; - } - } - } - - join LEVEL_SEP, map pack(KEY_TEMPLATE, @$_), @ret; + return $self->mk_SortKey(\@buf); } @@ -829,174 +733,6 @@ sub sort { } -sub _derivCE_22 { - my $u = shift; - my $base = (CJK_UidIni <= $u && $u <= CJK_UidF52 || $CompatUI{$u}) - ? 0xFB40 : # CJK - (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin || - CJK_ExtBIni <= $u && $u <= CJK_ExtBFin || - CJK_ExtCIni <= $u && $u <= CJK_ExtCFin || - CJK_ExtDIni <= $u && $u <= CJK_ExtDFin) - ? 0xFB80 # CJK ext. - : 0xFBC0; # others - my $aaaa = $base + ($u >> 15); - my $bbbb = ($u & 0x7FFF) | 0x8000; - return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u), - pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u); -} - -sub _derivCE_20 { - my $u = shift; - my $base = (CJK_UidIni <= $u && $u <= CJK_UidF52 || $CompatUI{$u}) - ? 0xFB40 : # CJK - (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin || - CJK_ExtBIni <= $u && $u <= CJK_ExtBFin || - CJK_ExtCIni <= $u && $u <= CJK_ExtCFin) - ? 0xFB80 # CJK ext. - : 0xFBC0; # others - my $aaaa = $base + ($u >> 15); - my $bbbb = ($u & 0x7FFF) | 0x8000; - return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u), - pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u); -} - -sub _derivCE_18 { - my $u = shift; - my $base = (CJK_UidIni <= $u && $u <= CJK_UidF51 || $CompatUI{$u}) - ? 0xFB40 : # CJK - (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin || - CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) - ? 0xFB80 # CJK ext. - : 0xFBC0; # others - my $aaaa = $base + ($u >> 15); - my $bbbb = ($u & 0x7FFF) | 0x8000; - return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u), - pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u); -} - -sub _derivCE_14 { - my $u = shift; - my $base = (CJK_UidIni <= $u && $u <= CJK_UidF41 || $CompatUI{$u}) - ? 0xFB40 : # CJK - (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin || - CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) - ? 0xFB80 # CJK ext. - : 0xFBC0; # others - my $aaaa = $base + ($u >> 15); - my $bbbb = ($u & 0x7FFF) | 0x8000; - return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u), - pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u); -} - -sub _derivCE_9 { - my $u = shift; - my $base = (CJK_UidIni <= $u && $u <= CJK_UidFin || $CompatUI{$u}) - ? 0xFB40 : # CJK - (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin || - CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) - ? 0xFB80 # CJK ext. - : 0xFBC0; # others - my $aaaa = $base + ($u >> 15); - my $bbbb = ($u & 0x7FFF) | 0x8000; - return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u), - pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u); -} - -sub _derivCE_8 { - my $code = shift; - my $aaaa = 0xFF80 + ($code >> 15); - my $bbbb = ($code & 0x7FFF) | 0x8000; - return pack(VCE_TEMPLATE, NON_VAR, $aaaa, 2, 1, $code), - pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $code); -} - -sub _uideoCE_8 { - my $u = shift; - return pack(VCE_TEMPLATE, NON_VAR, $u, Min2Wt, Min3Wt, $u); -} - -sub _isUIdeo { - # $uca_vers = 0 for _uideoCE_8() - my ($u, $uca_vers) = @_; - return((CJK_UidIni <= $u && ( - $uca_vers >= 20 ? ($u <= CJK_UidF52) : - $uca_vers >= 18 ? ($u <= CJK_UidF51) : - $uca_vers >= 14 ? ($u <= CJK_UidF41) : - ($u <= CJK_UidFin))) || $CompatUI{$u} - || - (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin) - || - ($uca_vers >= 8 && CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) - || - ($uca_vers >= 20 && CJK_ExtCIni <= $u && $u <= CJK_ExtCFin) - || - ($uca_vers >= 22 && CJK_ExtDIni <= $u && $u <= CJK_ExtDFin) - ); -} - - -## -## "hhhh hhhh hhhh" to (dddd, dddd, dddd) -## -sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g } - -# -# $code *must* be in Hangul syllable. -# Check it before you enter here. -# -sub _decompHangul { - my $code = shift; - my $si = $code - Hangul_SBase; - my $li = int( $si / Hangul_NCount); - my $vi = int(($si % Hangul_NCount) / Hangul_TCount); - my $ti = $si % Hangul_TCount; - return ( - Hangul_LBase + $li, - Hangul_VBase + $vi, - $ti ? (Hangul_TBase + $ti) : (), - ); -} - -sub _isIllegal { - my $code = shift; - return((! defined $code) # removed - || ($code < 0 || 0x10FFFF < $code) # out of range - ); -} - -sub _isNonchar { - my $code = shift; - return((($code & 0xFFFE) == 0xFFFE) # ??FFF[EF] (cf. utf8.c) - || (0xD800 <= $code && $code <= 0xDFFF) # unpaired surrogates - || (0xFDD0 <= $code && $code <= 0xFDEF) # other non-characters - ); -} - -# Hangul Syllable Type -sub getHST { - my $u = shift; - my $vers = shift || 0; - - if (Hangul_SIni <= $u && $u <= Hangul_SFin) { - return +($u - Hangul_SBase) % Hangul_TCount ? "LVT" : "LV"; - } - - if ($vers < 20) { - return Hangul_LIni <= $u && $u <= Hangul_LFin || - $u == Hangul_LFill ? "L" : - Hangul_VIni <= $u && $u <= Hangul_VFin ? "V" : - Hangul_TIni <= $u && $u <= Hangul_TFin ? "T" : ""; - } else { - return Hangul_LIni <= $u && $u <= Hangul_LEnd || - HangulL2Ini <= $u && $u <= HangulL2Fin ? "L" : - Hangul_VIni <= $u && $u <= Hangul_VEnd || - HangulV2Ini <= $u && $u <= HangulV2Fin ? "V" : - Hangul_TIni <= $u && $u <= Hangul_TEnd || - HangulT2Ini <= $u && $u <= HangulT2Fin ? "T" : ""; - } -} - - ## ## bool _nonIgnorAtLevel(arrayref weights, int level) ## @@ -1023,7 +759,7 @@ sub _eqArray($$$) my $lev = shift; for my $g (0..@$substr-1){ - # Do the $g'th graphemes have the same number of AV weigths? + # Do the $g'th graphemes have the same number of AV weights? return if @{ $source->[$g] } != @{ $substr->[$g] }; for my $w (0..@{ $substr->[$g] }-1) { @@ -1321,7 +1057,7 @@ The following tracking versions are supported. The default is 20. Note: Recent UTS #10 renames "Tracking Version" to "Revision." -* Noncharacters (e.g. U+FFFF) are not ignored, and can be overrided +* Noncharacters (e.g. U+FFFF) are not ignored, and can be overridden since C 22. * Fully ignorable characters were ignored, and would not interrupt @@ -1359,7 +1095,7 @@ forwards at all the levels. If the same character (or a sequence of characters) exists in the collation element table through C
, -mapping to collation elements is overrided. +mapping to collation elements is overridden. If it does not exist, the mapping is defined additionally. entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt) @@ -1536,7 +1272,7 @@ those in the CJK Unified Ideographs Extension A etc. Ext.D (U+2B740..U+2B81D) if UCA_Version is 22 or greater. Through C, ordering of CJK unified ideographs (including -extensions) can be overrided. +extensions) can be overridden. ex. CJK unified ideographs in the JIS code point order. @@ -1579,7 +1315,7 @@ in C
or C is still valid. B In addition to them, 12 CJK compatibility ideographs (C, C, C, C, C, C, C, C, C, C, C, C) are also treated as CJK unified -ideographs. But they can't be overrided via C when you use +ideographs. But they can't be overridden via C when you use DUCET, as the table includes weights for them. C
or C has priority over C. @@ -1589,7 +1325,7 @@ priority over C. By default, Hangul syllables are decomposed into Hangul Jamo, even if C<(normalization =E undef)>. -But the mapping of Hangul syllables may be overrided. +But the mapping of Hangul syllables may be overridden. This parameter works like C, so see there for examples. @@ -1750,7 +1486,7 @@ this parameter doesn't work validly. This key allows to variable weighting for variable collation elements, which are marked with an ASTERISK in the table -(NOTE: Many punction marks and symbols are variable in F). +(NOTE: Many punctuation marks and symbols are variable in F). variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'. @@ -2058,7 +1794,7 @@ B =head1 AUTHOR, COPYRIGHT AND LICENSE The Unicode::Collate module for perl was written by SADAHIRO Tomoyuki, -. This module is Copyright(C) 2001-2010, +. This module is Copyright(C) 2001-2011, SADAHIRO Tomoyuki. Japan. All rights reserved. This module is free software; you can redistribute it and/or diff --git a/cpan/Unicode-Collate/Collate.xs b/cpan/Unicode-Collate/Collate.xs new file mode 100644 index 0000000..d96912b --- /dev/null +++ b/cpan/Unicode-Collate/Collate.xs @@ -0,0 +1,691 @@ +#include "EXTERN.h" +#include "perl.h" +#include "XSUB.h" + +/* This file is prepared by mkheader */ +#include "ucatbl.h" + +/* Perl 5.6.1 ? */ +#ifndef utf8n_to_uvuni +#define utf8n_to_uvuni utf8_to_uv +#endif /* utf8n_to_uvuni */ + +/* UTF8_ALLOW_BOM is used before Perl 5.8.0 */ +#ifndef UTF8_ALLOW_BOM +#define UTF8_ALLOW_BOM (0) +#endif /* UTF8_ALLOW_BOM */ + +#ifndef UTF8_ALLOW_SURROGATE +#define UTF8_ALLOW_SURROGATE (0) +#endif /* UTF8_ALLOW_SURROGATE */ + +#ifndef UTF8_ALLOW_FE_FF +#define UTF8_ALLOW_FE_FF (0) +#endif /* UTF8_ALLOW_FE_FF */ + +#ifndef UTF8_ALLOW_FFFF +#define UTF8_ALLOW_FFFF (0) +#endif /* UTF8_ALLOW_FFFF */ + +#define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|UTF8_ALLOW_FE_FF|UTF8_ALLOW_FFFF) + +/* if utf8n_to_uvuni() sets retlen to 0 (?) */ +#define ErrRetlenIsZero "panic (Unicode::Collate): zero-length character" + +/* At present, char > 0x10ffff are unaffected without complaint, right? */ +#define VALID_UTF_MAX (0x10ffff) +#define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv)) + +static const UV max_div_16 = UV_MAX / 16; + +/* Supported Levels */ +#define MinLevel (1) +#define MaxLevel (4) + +/* Shifted weight at 4th level */ +#define Shift4Wt (0xFFFF) + +#define VCE_Length (9) + +#define Hangul_SBase (0xAC00) +#define Hangul_SIni (0xAC00) +#define Hangul_SFin (0xD7A3) +#define Hangul_NCount (588) +#define Hangul_TCount (28) +#define Hangul_LBase (0x1100) +#define Hangul_LIni (0x1100) +#define Hangul_LFin (0x1159) +#define Hangul_LFill (0x115F) +#define Hangul_LEnd (0x115F) /* Unicode 5.2 */ +#define Hangul_VBase (0x1161) +#define Hangul_VIni (0x1160) /* from Vowel Filler */ +#define Hangul_VFin (0x11A2) +#define Hangul_VEnd (0x11A7) /* Unicode 5.2 */ +#define Hangul_TBase (0x11A7) /* from "no-final" codepoint */ +#define Hangul_TIni (0x11A8) +#define Hangul_TFin (0x11F9) +#define Hangul_TEnd (0x11FF) /* Unicode 5.2 */ +#define HangulL2Ini (0xA960) /* Unicode 5.2 */ +#define HangulL2Fin (0xA97C) /* Unicode 5.2 */ +#define HangulV2Ini (0xD7B0) /* Unicode 5.2 */ +#define HangulV2Fin (0xD7C6) /* Unicode 5.2 */ +#define HangulT2Ini (0xD7CB) /* Unicode 5.2 */ +#define HangulT2Fin (0xD7FB) /* Unicode 5.2 */ + +#define CJK_UidIni (0x4E00) +#define CJK_UidFin (0x9FA5) +#define CJK_UidF41 (0x9FBB) +#define CJK_UidF51 (0x9FC3) +#define CJK_UidF52 (0x9FCB) +#define CJK_ExtAIni (0x3400) /* Unicode 3.0 */ +#define CJK_ExtAFin (0x4DB5) /* Unicode 3.0 */ +#define CJK_ExtBIni (0x20000) /* Unicode 3.1 */ +#define CJK_ExtBFin (0x2A6D6) /* Unicode 3.1 */ +#define CJK_ExtCIni (0x2A700) /* Unicode 5.2 */ +#define CJK_ExtCFin (0x2B734) /* Unicode 5.2 */ +#define CJK_ExtDIni (0x2B740) /* Unicode 6.0 */ +#define CJK_ExtDFin (0x2B81D) /* Unicode 6.0 */ + +static STDCHAR UnifiedCompat[] = { + 1,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,0,1,1,1 +}; /* E F 0 1 2 3 4 5 6 7 8 9 A B C D E F 0 1 2 3 4 5 6 7 8 9 */ + +#define codeRange(bcode, ecode) ((bcode) <= code && code <= (ecode)) + +MODULE = Unicode::Collate PACKAGE = Unicode::Collate + +PROTOTYPES: DISABLE + +void +_fetch_rest () + PREINIT: + char ** rest; + PPCODE: + for (rest = UCA_rest; *rest; ++rest) { + XPUSHs(sv_2mortal(newSVpv((char *) *rest, 0))); + } + + +void +_fetch_simple (uv) + UV uv + PREINIT: + U8 ***plane, **row; + U8* result = NULL; + PPCODE: + if (!OVER_UTF_MAX(uv)){ + plane = (U8***)UCA_simple[uv >> 16]; + if (plane) { + row = plane[(uv >> 8) & 0xff]; + result = row ? row[uv & 0xff] : NULL; + } + } + if (result) { + int i; + int num = (int)*result; + ++result; + for (i = 0; i < num; ++i) { + XPUSHs(sv_2mortal(newSVpvn((char *) result, VCE_Length))); + result += VCE_Length; + } + } else { + XPUSHs(sv_2mortal(newSViv(0))); + } + +SV* +_ignorable_simple (uv) + UV uv + ALIAS: + _exists_simple = 1 + PREINIT: + U8 ***plane, **row; + int num = -1; + U8* result = NULL; + CODE: + if (!OVER_UTF_MAX(uv)){ + plane = (U8***)UCA_simple[uv >> 16]; + if (plane) { + row = plane[(uv >> 8) & 0xff]; + result = row ? row[uv & 0xff] : NULL; + } + if (result) + num = (int)*result; /* assuming 0 <= num < 128 */ + } + + if (ix) + RETVAL = boolSV(num >0); + else + RETVAL = boolSV(num==0); + OUTPUT: + RETVAL + + +void +_getHexArray (src) + SV* src + PREINIT: + char *s, *e; + STRLEN byte; + UV value; + bool overflowed = FALSE; + const char *hexdigit; + PPCODE: + s = SvPV(src,byte); + for (e = s + byte; s < e;) { + hexdigit = strchr((char *) PL_hexdigit, *s++); + if (! hexdigit) + continue; + value = (hexdigit - PL_hexdigit) & 0xF; + while (*s) { + hexdigit = strchr((char *) PL_hexdigit, *s++); + if (! hexdigit) + break; + if (overflowed) + continue; + if (value > max_div_16) { + overflowed = TRUE; + continue; + } + value = (value << 4) | ((hexdigit - PL_hexdigit) & 0xF); + } + XPUSHs(sv_2mortal(newSVuv(overflowed ? UV_MAX : value))); + } + + +SV* +_isIllegal (sv) + SV* sv + PREINIT: + UV uv; + CODE: + if (!sv || !SvIOK(sv)) + XSRETURN_YES; + uv = SvUVX(sv); + RETVAL = boolSV( + 0x10FFFF < uv /* out of range */ + ); +OUTPUT: + RETVAL + + +SV* +_isNonchar (sv) + SV* sv + PREINIT: + UV uv; + CODE: + /* should be called only if ! _isIllegal(sv). */ + uv = SvUVX(sv); + RETVAL = boolSV( + ((uv & 0xFFFE) == 0xFFFE) /* ??FFF[EF] (cf. utf8.c) */ + || (0xD800 <= uv && uv <= 0xDFFF) /* unpaired surrogates */ + || (0xFDD0 <= uv && uv <= 0xFDEF) /* other non-characters */ + ); +OUTPUT: + RETVAL + + +void +_decompHangul (code) + UV code + PREINIT: + UV sindex, lindex, vindex, tindex; + PPCODE: + /* code *must* be in Hangul syllable. + * Check it before you enter here. */ + sindex = code - Hangul_SBase; + lindex = sindex / Hangul_NCount; + vindex = (sindex % Hangul_NCount) / Hangul_TCount; + tindex = sindex % Hangul_TCount; + + XPUSHs(sv_2mortal(newSVuv(lindex + Hangul_LBase))); + XPUSHs(sv_2mortal(newSVuv(vindex + Hangul_VBase))); + if (tindex) + XPUSHs(sv_2mortal(newSVuv(tindex + Hangul_TBase))); + + +SV* +getHST (code, uca_vers = 0) + UV code; + IV uca_vers; + PREINIT: + char * hangtype; + STRLEN typelen; + CODE: + if (codeRange(Hangul_SIni, Hangul_SFin)) { + if ((code - Hangul_SBase) % Hangul_TCount) { + hangtype = "LVT"; typelen = 3; + } else { + hangtype = "LV"; typelen = 2; + } + } else if (uca_vers < 20) { + if (codeRange(Hangul_LIni, Hangul_LFin) || code == Hangul_LFill) { + hangtype = "L"; typelen = 1; + } else if (codeRange(Hangul_VIni, Hangul_VFin)) { + hangtype = "V"; typelen = 1; + } else if (codeRange(Hangul_TIni, Hangul_TFin)) { + hangtype = "T"; typelen = 1; + } else { + hangtype = ""; typelen = 0; + } + } else { + if (codeRange(Hangul_LIni, Hangul_LEnd) || + codeRange(HangulL2Ini, HangulL2Fin)) { + hangtype = "L"; typelen = 1; + } else if (codeRange(Hangul_VIni, Hangul_VEnd) || + codeRange(HangulV2Ini, HangulV2Fin)) { + hangtype = "V"; typelen = 1; + } else if (codeRange(Hangul_TIni, Hangul_TEnd) || + codeRange(HangulT2Ini, HangulT2Fin)) { + hangtype = "T"; typelen = 1; + } else { + hangtype = ""; typelen = 0; + } + } + + RETVAL = newSVpvn(hangtype, typelen); +OUTPUT: + RETVAL + + +void +_derivCE_9 (code) + UV code + ALIAS: + _derivCE_14 = 1 + _derivCE_18 = 2 + _derivCE_20 = 3 + _derivCE_22 = 4 + PREINIT: + UV base, aaaa, bbbb; + U8 a[VCE_Length + 1] = "\x00\xFF\xFF\x00\x20\x00\x02\xFF\xFF"; + U8 b[VCE_Length + 1] = "\x00\xFF\xFF\x00\x00\x00\x00\xFF\xFF"; + bool basic_unified = 0; + PPCODE: + if (CJK_UidIni <= code) { + if (codeRange(0xFA0E, 0xFA29)) + basic_unified = (bool)UnifiedCompat[code - 0xFA0E]; + else + basic_unified = (ix >= 3 ? (code <= CJK_UidF52) : + ix == 2 ? (code <= CJK_UidF51) : + ix == 1 ? (code <= CJK_UidF41) : + (code <= CJK_UidFin)); + } + base = (basic_unified) + ? 0xFB40 : /* CJK */ + ((codeRange(CJK_ExtAIni, CJK_ExtAFin)) + || + (codeRange(CJK_ExtBIni, CJK_ExtBFin)) + || + (ix >= 3 && codeRange(CJK_ExtCIni, CJK_ExtCFin)) + || + (ix >= 4 && codeRange(CJK_ExtDIni, CJK_ExtDFin))) + ? 0xFB80 /* CJK ext. */ + : 0xFBC0; /* others */ + aaaa = base + (code >> 15); + bbbb = (code & 0x7FFF) | 0x8000; + a[1] = (U8)(aaaa >> 8); + a[2] = (U8)(aaaa & 0xFF); + b[1] = (U8)(bbbb >> 8); + b[2] = (U8)(bbbb & 0xFF); + a[7] = b[7] = (U8)(code >> 8); + a[8] = b[8] = (U8)(code & 0xFF); + XPUSHs(sv_2mortal(newSVpvn((char *) a, VCE_Length))); + XPUSHs(sv_2mortal(newSVpvn((char *) b, VCE_Length))); + + +void +_derivCE_8 (code) + UV code + PREINIT: + UV aaaa, bbbb; + U8 a[VCE_Length + 1] = "\x00\xFF\xFF\x00\x02\x00\x01\xFF\xFF"; + U8 b[VCE_Length + 1] = "\x00\xFF\xFF\x00\x00\x00\x00\xFF\xFF"; + PPCODE: + aaaa = 0xFF80 + (code >> 15); + bbbb = (code & 0x7FFF) | 0x8000; + a[1] = (U8)(aaaa >> 8); + a[2] = (U8)(aaaa & 0xFF); + b[1] = (U8)(bbbb >> 8); + b[2] = (U8)(bbbb & 0xFF); + a[7] = b[7] = (U8)(code >> 8); + a[8] = b[8] = (U8)(code & 0xFF); + XPUSHs(sv_2mortal(newSVpvn((char *) a, VCE_Length))); + XPUSHs(sv_2mortal(newSVpvn((char *) b, VCE_Length))); + + +void +_uideoCE_8 (code) + UV code + PREINIT: + U8 uice[VCE_Length + 1] = "\x00\xFF\xFF\x00\x20\x00\x02\xFF\xFF"; + PPCODE: + uice[1] = uice[7] = (U8)(code >> 8); + uice[2] = uice[8] = (U8)(code & 0xFF); + XPUSHs(sv_2mortal(newSVpvn((char *) uice, VCE_Length))); + + +SV* +_isUIdeo (code, uca_vers) + UV code; + IV uca_vers; + bool basic_unified = 0; + CODE: + /* uca_vers = 0 for _uideoCE_8() */ + if (CJK_UidIni <= code) { + if (codeRange(0xFA0E, 0xFA29)) + basic_unified = (bool)UnifiedCompat[code - 0xFA0E]; + else + basic_unified = (uca_vers >= 20 ? (code <= CJK_UidF52) : + uca_vers >= 18 ? (code <= CJK_UidF51) : + uca_vers >= 14 ? (code <= CJK_UidF41) : + (code <= CJK_UidFin)); + } + RETVAL = boolSV( + (basic_unified) + || + (codeRange(CJK_ExtAIni, CJK_ExtAFin)) + || + (uca_vers >= 8 && codeRange(CJK_ExtBIni, CJK_ExtBFin)) + || + (uca_vers >= 20 && codeRange(CJK_ExtCIni, CJK_ExtCFin)) + || + (uca_vers >= 22 && codeRange(CJK_ExtDIni, CJK_ExtDFin)) + ); +OUTPUT: + RETVAL + + +SV* +mk_SortKey (self, buf) + SV* self; + SV* buf; + PREINIT: + SV *dst, **svp; + STRLEN dlen, vlen; + U8 *d, *p, *e, *v, *s[MaxLevel], *eachlevel[MaxLevel]; + AV *bufAV; + HV *selfHV; + UV back_flag; + I32 i, buf_len; + IV lv, level, uca_vers; + bool upper_lower, kata_hira, v2i, last_is_var; + CODE: + if (SvROK(self) && SvTYPE(SvRV(self)) == SVt_PVHV) + selfHV = (HV*)SvRV(self); + else + croak("$self is not a HASHREF."); + + svp = hv_fetch(selfHV, "level", 5, FALSE); + level = svp ? SvIV(*svp) : MaxLevel; + + if (SvROK(buf) && SvTYPE(SvRV(buf)) == SVt_PVAV) + bufAV = (AV*)SvRV(buf); + else + croak("XSUB, not an ARRAYREF."); + + buf_len = av_len(bufAV); + + if (buf_len < 0) { /* empty: -1 */ + dlen = 2 * (MaxLevel - 1); + dst = newSV(dlen); + (void)SvPOK_only(dst); + d = (U8*)SvPVX(dst); + while (dlen--) + *d++ = '\0'; + } + else { + for (lv = 0; lv < level; lv++) { + New(0, eachlevel[lv], 2 * (1 + buf_len) + 1, U8); + s[lv] = eachlevel[lv]; + } + + svp = hv_fetch(selfHV, "upper_before_lower", 18, FALSE); + upper_lower = svp ? SvTRUE(*svp) : FALSE; + svp = hv_fetch(selfHV, "katakana_before_hiragana", 24, FALSE); + kata_hira = svp ? SvTRUE(*svp) : FALSE; + svp = hv_fetch(selfHV, "UCA_Version", 11, FALSE); + uca_vers = SvIV(*svp); + svp = hv_fetch(selfHV, "variable", 8, FALSE); + v2i = uca_vers >= 9 && svp /* (vers >= 9) and not (non-ignorable) */ + ? !(SvCUR(*svp) == 13 && memEQ(SvPVX(*svp), "non-ignorable", 13)) + : FALSE; + + last_is_var = FALSE; + for (i = 0; i <= buf_len; i++) { + svp = av_fetch(bufAV, i, FALSE); + + if (svp && SvPOK(*svp)) + v = (U8*)SvPV(*svp, vlen); + else + croak("not a vwt."); + + if (vlen < VCE_Length) /* ignore short VCE (unexpected) */ + continue; + + /* "Ignorable (L1, L2) after Variable" since track. v. 9 */ + if (v2i) { + if (*v) + last_is_var = TRUE; + else if (v[1] || v[2]) /* non zero primary weight */ + last_is_var = FALSE; + else if (last_is_var) /* zero primary weight; skipped */ + continue; + } + + if (v[5] == 0) { /* tert wt < 256 */ + if (upper_lower) { + if (0x8 <= v[6] && v[6] <= 0xC) /* lower */ + v[6] -= 6; + else if (0x2 <= v[6] && v[6] <= 0x6) /* upper */ + v[6] += 6; + else if (v[6] == 0x1C) /* square upper */ + v[6]++; + else if (v[6] == 0x1D) /* square lower */ + v[6]--; + } + if (kata_hira) { + if (0x0F <= v[6] && v[6] <= 0x13) /* katakana */ + v[6] -= 2; + else if (0xD <= v[6] && v[6] <= 0xE) /* hiragana */ + v[6] += 5; + } + } + + for (lv = 0; lv < level; lv++) { + if (v[2 * lv + 1] || v[2 * lv + 2]) { + *s[lv]++ = v[2 * lv + 1]; + *s[lv]++ = v[2 * lv + 2]; + } + } + } + + dlen = 2 * (MaxLevel - 1); + for (lv = 0; lv < level; lv++) + dlen += s[lv] - eachlevel[lv]; + + dst = newSV(dlen); + (void)SvPOK_only(dst); + d = (U8*)SvPVX(dst); + + svp = hv_fetch(selfHV, "backwardsFlag", 13, FALSE); + back_flag = svp ? SvUV(*svp) : (UV)0; + + for (lv = 0; lv < level; lv++) { + if (back_flag & (1 << (lv + 1))) { + p = s[lv]; + e = eachlevel[lv]; + for ( ; e < p; p -= 2) { + *d++ = p[-2]; + *d++ = p[-1]; + } + } + else { + p = eachlevel[lv]; + e = s[lv]; + while (p < e) + *d++ = *p++; + } + if (lv + 1 < MaxLevel) { /* lv + 1 == real level */ + *d++ = '\0'; + *d++ = '\0'; + } + } + + for (lv = level; lv < MaxLevel; lv++) { + if (lv + 1 < MaxLevel) { /* lv + 1 == real level */ + *d++ = '\0'; + *d++ = '\0'; + } + } + + for (lv = 0; lv < level; lv++) { + Safefree(eachlevel[lv]); + } + } + *d = '\0'; + SvCUR_set(dst, d - (U8*)SvPVX(dst)); + RETVAL = dst; +OUTPUT: + RETVAL + + +SV* +_varCE (vbl, vce) + SV* vbl + SV* vce + PREINIT: + SV *dst; + U8 *a, *v, *d; + STRLEN alen, vlen; + CODE: + a = (U8*)SvPV(vbl, alen); + v = (U8*)SvPV(vce, vlen); + + dst = newSV(vlen); + d = (U8*)SvPVX(dst); + (void)SvPOK_only(dst); + Copy(v, d, vlen, U8); + SvCUR_set(dst, vlen); + d[vlen] = '\0'; + + /* variable: checked only the first char and the length, + trusting checkCollator() and %VariableOK in Perl ... */ + + if (vlen < VCE_Length /* ignore short VCE (unexpected) */ + || + *a == 'n') /* 'non-ignorable' */ + 1; + else if (*v) { + if (*a == 's') { /* shifted or shift-trimmed */ + d[7] = d[1]; /* wt level 1 to 4 */ + d[8] = d[2]; + } + d[1] = d[2] = d[3] = d[4] = d[5] = d[6] = '\0'; + } + else if (*a == 'b') /* blanked */ + 1; + else if (*a == 's') { /* shifted or shift-trimmed */ + if (alen == 7 && (d[1] + d[2] + d[3] + d[4] + d[5] + d[6])) { + d[7] = (U8)(Shift4Wt >> 8); + d[8] = (U8)(Shift4Wt & 0xFF); + } + else { + d[7] = d[8] = 0; + } + } + else + croak("unknown variable value '%s'", a); + RETVAL = dst; +OUTPUT: + RETVAL + + + +SV* +visualizeSortKey (self, key) + SV * self + SV * key + PREINIT: + HV *selfHV; + SV **svp, *dst; + U8 *s, *e, *d; + STRLEN klen, dlen; + UV uv; + IV uca_vers; + static char *upperhex = "0123456789ABCDEF"; + CODE: + if (SvROK(self) && SvTYPE(SvRV(self)) == SVt_PVHV) + selfHV = (HV*)SvRV(self); + else + croak("$self is not a HASHREF."); + + svp = hv_fetch(selfHV, "UCA_Version", 11, FALSE); + if (!svp) + croak("Panic: no $self->{UCA_Version} in visualizeSortKey"); + uca_vers = SvIV(*svp); + + s = (U8*)SvPV(key, klen); + + /* slightly *longer* than the need, but I'm afraid of miscounting; + exactly: (klen / 2) * 5 + MaxLevel * 2 - 1 (excluding '\0') + = (klen / 2) * 5 - 1 # FFFF (16bit) and ' ' between 16bit units + + (MaxLevel - 1) * 2 # ' ' and '|' for level boundaries + + 2 # '[' and ']' + */ + dlen = (klen / 2) * 5 + MaxLevel * 2 + 2; + dst = newSV(dlen); + (void)SvPOK_only(dst); + d = (U8*)SvPVX(dst); + + *d++ = '['; + for (e = s + klen; s < e; s += 2) { + uv = (U16)(*s << 8 | s[1]); + if (uv) { + if ((d[-1] != '[') && ((9 <= uca_vers) || (d[-1] != '|'))) + *d++ = ' '; + *d++ = upperhex[ (s[0] >> 4) & 0xF ]; + *d++ = upperhex[ s[0] & 0xF ]; + *d++ = upperhex[ (s[1] >> 4) & 0xF ]; + *d++ = upperhex[ s[1] & 0xF ]; + } + else { + if ((9 <= uca_vers) && (d[-1] != '[')) + *d++ = ' '; + *d++ = '|'; + } + } + *d++ = ']'; + *d = '\0'; + SvCUR_set(dst, d - (U8*)SvPVX(dst)); + RETVAL = dst; +OUTPUT: + RETVAL + + + +void +unpack_U (src) + SV* src + PREINIT: + STRLEN srclen, retlen; + U8 *s, *p, *e; + UV uv; + PPCODE: + s = (U8*)SvPV(src,srclen); + if (!SvUTF8(src)) { + SV* tmpsv = sv_mortalcopy(src); + if (!SvPOK(tmpsv)) + (void)sv_pvn_force(tmpsv,&srclen); + sv_utf8_upgrade(tmpsv); + s = (U8*)SvPV(tmpsv,srclen); + } + e = s + srclen; + + for (p = s; p < e; p += retlen) { + uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); + if (!retlen) + croak(ErrRetlenIsZero); + XPUSHs(sv_2mortal(newSVuv(uv))); + } + diff --git a/cpan/Unicode-Collate/Collate/Locale.pm b/cpan/Unicode-Collate/Collate/Locale.pm index 5dddfb8..c589144 100644 --- a/cpan/Unicode-Collate/Collate/Locale.pm +++ b/cpan/Unicode-Collate/Collate/Locale.pm @@ -4,12 +4,11 @@ use strict; use Carp; use base qw(Unicode::Collate); -our $VERSION = '0.68'; +our $VERSION = '0.71'; use File::Spec; (my $ModPath = $INC{'Unicode/Collate/Locale.pm'}) =~ s/\.pm$//; -my $KeyPath = File::Spec->catfile('allkeys.txt'); my $PL_EXT = '.pl'; my %LocaleFile = map { ($_, $_) } qw( @@ -71,7 +70,6 @@ sub new { if (exists $hash{table}) { croak "your table can't be used with Unicode::Collate::Locale"; } - $hash{table} = $KeyPath; my $href = _fetchpl($hash{accepted_locale}); while (my($k,$v) = each %$href) { @@ -297,7 +295,7 @@ tailored as well as it. For example, even though W is tailored, fullwidth W (C), W with acute (C), etc. are not tailored. The result may depend on whether source strings are normalized or not, and whether decomposed or composed. -Thus C<(normalization =E undef> is less preferred. +Thus C<(normalization =E undef)> is less preferred. =back @@ -305,7 +303,7 @@ Thus C<(normalization =E undef> is less preferred. The Unicode::Collate::Locale module for perl was written by SADAHIRO Tomoyuki, . -This module is Copyright(C) 2004-2010, SADAHIRO Tomoyuki. Japan. +This module is Copyright(C) 2004-2011, SADAHIRO Tomoyuki. Japan. All rights reserved. This module is free software; you can redistribute it and/or diff --git a/cpan/Unicode-Collate/Makefile.PL b/cpan/Unicode-Collate/Makefile.PL new file mode 100644 index 0000000..30d6fc0 --- /dev/null +++ b/cpan/Unicode-Collate/Makefile.PL @@ -0,0 +1,28 @@ +require 5.006001; +use ExtUtils::MakeMaker; + +my $clean = {}; + +if (-f "Collate.xs") { + print STDERR "Making header files for XS...\n"; + + do 'mkheader' or die $@ || "mkheader: $!"; + + $clean = { FILES => 'ucatbl.h' }; +} + +WriteMakefile( + 'INSTALLDIRS' => $] >= 5.007002 ? 'perl' : 'site', + 'NAME' => 'Unicode::Collate', + 'VERSION_FROM' => 'Collate.pm', # finds $VERSION + 'clean' => $clean, + 'PREREQ_PM' => { + Carp => 0, + constant => 0, + DynaLoader => 0, + File::Spec => 0, + strict => 0, + Test => 0, + warnings => 0, + }, +); diff --git a/cpan/Unicode-Collate/README b/cpan/Unicode-Collate/README index 16bf8c4..743d713 100644 --- a/cpan/Unicode-Collate/README +++ b/cpan/Unicode-Collate/README @@ -1,4 +1,4 @@ -Unicode/Collate version 0.68 +Unicode/Collate version 0.72 =============================== NAME @@ -40,6 +40,7 @@ INSTALL gendata/*, and mklocale. Tests for Unicode::Collate::Locale are named t/loc_*.t. +Since 0.54, XSUB that requires a C compiler will be built by default. To install this module type the following: perl Makefile.PL @@ -47,20 +48,20 @@ To install this module type the following: make test make install -If you have a C compiler and want to use XSUB edition, -type the following (!! "enableXS" must run before "Makefile.PL" !!): +Even if a C compiler is not available, pure Perl (i.e. non-XS) edition +is available; type the following: - perl enableXS + perl disableXS perl Makefile.PL make make test make install -If you decide to install pure Perl (i.e. non-XS) edition after trying -to build XSUB, type the following: +If you decide to install XSUB edition after trying to build pure Perl, +type the following: make clean - perl disableXS + perl enableXS perl Makefile.PL make make test @@ -107,7 +108,7 @@ HOW TO CHANGE DUCET (NOT WARRANTED) AUTHOR, COPYRIGHT AND LICENSE The Unicode::Collate module for perl was written by SADAHIRO Tomoyuki, -. This module is Copyright(C) 2001-2010, +. This module is Copyright(C) 2001-2011, SADAHIRO Tomoyuki. Japan. All rights reserved. This module is free software; you can redistribute it and/or diff --git a/cpan/Unicode-Collate/mkheader b/cpan/Unicode-Collate/mkheader new file mode 100644 index 0000000..dde4ee1 --- /dev/null +++ b/cpan/Unicode-Collate/mkheader @@ -0,0 +1,196 @@ +#!perl +# +# This auxiliary script makes five header files +# used for building XSUB of Unicode::Collate. +# +# Usage: +# in perl, or in command line +# +# Input file: +# Collate/allkeys.txt +# +# Output file: +# ucatbl.h +# +use 5.006; +use strict; +use warnings; +use Carp; +use File::Spec; + +BEGIN { + unless ("A" eq pack('U', 0x41)) { + die "Unicode::Collate cannot stringify a Unicode code point\n"; + } +} + +use constant TRUE => 1; +use constant FALSE => ""; +use constant VCE_TEMPLATE => 'Cn4'; + +sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g } + +our $PACKAGE = 'Unicode::Collate, mkheader'; +our $prefix = "UCA_"; + +our %SimpleEntries; # $codepoint => $keys +our @Rest; + +{ + my($f, $fh); + foreach my $d ('.') { + $f = File::Spec->catfile($d, "Collate", "allkeys.txt"); + last if open($fh, $f); + $f = undef; + } + croak "$PACKAGE: Collate/allkeys.txt is not found" if !defined $f; + + while (my $line = <$fh>) { + next if $line =~ /^\s*#/; + if ($line =~ /^\s*\@/) { + push @Rest, $line; + next; + } + + next if $line !~ /^\s*[0-9A-Fa-f]/; + + $line =~ s/[#%]\s*(.*)//; # removing comment (not getting the name) + + # gets element + my($e, $k) = split /;/, $line; + + croak "Wrong Entry: must be separated by ';' ". + "from " if ! $k; + + my @uv = _getHexArray($e); + next if !@uv; + + if (@uv != 1) { + push @Rest, $line; + next; + } + + my $is_L3_ignorable = TRUE; + + my @key; + foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed + my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient. + my @wt = _getHexArray($arr); + push @key, pack(VCE_TEMPLATE, $var, @wt); + $is_L3_ignorable = FALSE + if $wt[0] || $wt[1] || $wt[2]; + # Conformance Test for 3.1.1 and 4.0.0 shows Level 3 ignorable + # is completely ignorable. + # For expansion, an entry $is_L3_ignorable + # if and only if "all" CEs are [.0000.0000.0000]. + } + my $mapping = $is_L3_ignorable ? [] : \@key; + my $num = @$mapping; + my $str = chr($num).join('', @$mapping); + $SimpleEntries{$uv[0]} = stringify($str); + } +} + +sub stringify { + my $str = shift; + return sprintf '"%s"', join '', + map sprintf("\\x%02x", ord $_), split //, $str; + +} + +########## writing header files ########## + +my $init = ''; +{ + my $type = "char*"; + my $head = $prefix."rest"; + + $init .= "static $type $head [] = {\n"; + for my $line (@Rest) { + $line =~ s/\s*\z//; + next if $line eq ''; + $init .= "/*$line*/\n" if $line =~ /^[A-Za-z0-9_.:;@\ \[\]]+\z/; + $init .= "($type)".stringify($line).",\n"; + } + $init .= "NULL\n"; # sentinel + $init .= "};\n\n"; +} + +my @tripletable = ( + { + file => "ucatbl", + name => "simple", + type => "char*", + hash => \%SimpleEntries, + null => "NULL", + init => $init, + }, +); + +foreach my $tbl (@tripletable) { + my $file = "$tbl->{file}.h"; + my $head = "${prefix}$tbl->{name}"; + my $type = $tbl->{type}; + my $hash = $tbl->{hash}; + my $null = $tbl->{null}; + my $init = $tbl->{init}; + + open FH, ">$file" or croak "$PACKAGE: $file can't be made"; + binmode FH; select FH; + my %val; + + print FH << 'EOF'; +/* + * This file is auto-generated by mkheader. + * Any changes here will be lost! + */ +EOF + + print $init if defined $init; + + foreach my $uv (keys %$hash) { + croak sprintf("a Unicode code point 0x%04X over 0x10FFFF.", $uv) + unless $uv <= 0x10FFFF; + my @c = unpack 'CCCC', pack 'N', $uv; + $val{ $c[1] }{ $c[2] }{ $c[3] } = $hash->{$uv}; + } + + foreach my $p (sort { $a <=> $b } keys %val) { + next if ! $val{ $p }; + for (my $r = 0; $r < 256; $r++) { + next if ! $val{ $p }{ $r }; + printf "static $type ${head}_%02x_%02x [256] = {\n", $p, $r; + for (my $c = 0; $c < 256; $c++) { + print "\t", defined $val{$p}{$r}{$c} + ? "($type)".$val{$p}{$r}{$c} + : $null; + print ',' if $c != 255; + print "\n" if $c % 8 == 7; + } + print "};\n\n"; + } + } + foreach my $p (sort { $a <=> $b } keys %val) { + next if ! $val{ $p }; + printf "static $type* ${head}_%02x [256] = {\n", $p; + for (my $r = 0; $r < 256; $r++) { + print $val{ $p }{ $r } + ? sprintf("${head}_%02x_%02x", $p, $r) + : "NULL"; + print ',' if $r != 255; + print "\n" if $val{ $p }{ $r } || ($r+1) % 8 == 0; + } + print "};\n\n"; + } + print "static $type** $head [] = {\n"; + for (my $p = 0; $p <= 0x10; $p++) { + print $val{ $p } ? sprintf("${head}_%02x", $p) : "NULL"; + print ',' if $p != 0x10; + print "\n"; + } + print "};\n\n"; + close FH; +} + +1; +__END__ diff --git a/cpan/Unicode-Collate/t/loc_test.t b/cpan/Unicode-Collate/t/loc_test.t index d1b5b4a..8d7d74a 100644 --- a/cpan/Unicode-Collate/t/loc_test.t +++ b/cpan/Unicode-Collate/t/loc_test.t @@ -12,7 +12,7 @@ BEGIN { } use Test; -BEGIN { plan tests => 116 }; +BEGIN { plan tests => 120 }; use strict; use warnings; @@ -127,3 +127,14 @@ our @sortFr = $objFr->sort(@randFr); ok("@sortFr" eq "@listFr"); # 116 + +{ + my $keyXS = '__useXS'; # see Unicode::Collate internal + my $noLoc = Unicode::Collate->new(normalization => undef); + my $UseXS = ref($noLoc->{$keyXS}); + ok(ref($Collator->{$keyXS}), $UseXS); + ok(ref($objFr ->{$keyXS}), $UseXS); + ok(ref($objEs ->{$keyXS}), $UseXS); + ok(ref($objEsT ->{$keyXS}), $UseXS); +} +# 120 diff --git a/pod/perldelta.pod b/pod/perldelta.pod index ce7efdf..2035921 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -96,6 +96,13 @@ XXX =item * +C has been upgraded from version 0.68 to 0.72 + +This also sees the switch from using the pure-perl version of this +module to the XS version.` + +=item * + XXX =back -- 2.7.4