From 5a449a8e553425ace4bea1ea13c16c03c6c5bf4c Mon Sep 17 00:00:00 2001 From: Chris 'BinGOs' Williams Date: Wed, 19 Jan 2011 14:14:46 +0000 Subject: [PATCH] Revert "Update Unicode-Collate to CPAN version 0.70 and enable XS version" This reverts commit 211cc5012284f4bd900fcaa630adbcac69ca6112. --- MANIFEST | 3 - cpan/Unicode-Collate/.gitignore | 1 - cpan/Unicode-Collate/Changes | 18 +- cpan/Unicode-Collate/Collate.pm | 360 ++++++++++++++--- cpan/Unicode-Collate/Collate.xs | 691 --------------------------------- cpan/Unicode-Collate/Collate/Locale.pm | 8 +- cpan/Unicode-Collate/Makefile.PL | 28 -- cpan/Unicode-Collate/README | 17 +- cpan/Unicode-Collate/mkheader | 196 ---------- cpan/Unicode-Collate/t/loc_test.t | 12 +- 10 files changed, 330 insertions(+), 1004 deletions(-) delete mode 100644 cpan/Unicode-Collate/.gitignore delete mode 100644 cpan/Unicode-Collate/Collate.xs delete mode 100644 cpan/Unicode-Collate/Makefile.PL delete mode 100644 cpan/Unicode-Collate/mkheader diff --git a/MANIFEST b/MANIFEST index a1e1124..aaf7227 100644 --- a/MANIFEST +++ b/MANIFEST @@ -2554,9 +2554,6 @@ cpan/Unicode-Collate/Collate/Locale/zh_pin.pl Unicode::Collate cpan/Unicode-Collate/Collate/Locale/zh.pl Unicode::Collate cpan/Unicode-Collate/Collate/Locale/zh_strk.pl Unicode::Collate cpan/Unicode-Collate/Collate.pm Unicode::Collate -cpan/Unicode-Collate/Collate.xs Unicode::Collate -cpan/Unicode-Collate/Makefile.PL Unicode::Collate -cpan/Unicode-Collate/mkheader Unicode::Collate cpan/Unicode-Collate/README Unicode::Collate cpan/Unicode-Collate/t/altern.t Unicode::Collate cpan/Unicode-Collate/t/backwds.t Unicode::Collate diff --git a/cpan/Unicode-Collate/.gitignore b/cpan/Unicode-Collate/.gitignore deleted file mode 100644 index 424c745..0000000 --- a/cpan/Unicode-Collate/.gitignore +++ /dev/null @@ -1 +0,0 @@ -*.h diff --git a/cpan/Unicode-Collate/Changes b/cpan/Unicode-Collate/Changes index c7bba12..ca9be54 100644 --- a/cpan/Unicode-Collate/Changes +++ b/cpan/Unicode-Collate/Changes @@ -1,13 +1,5 @@ Revision history for Perl module Unicode::Collate. -0.70 Sun Jan 16 20:31:07 2011 - - Now U::C::Locale->new will use the compiled DUCET via XS. - -0.69 Sat Jan 15 19:41:11 2011 - - clarified about XSUB. revised INSTALL in README. - - xs: flag passed to utf8n_to_uvuni(). - - doc and comments: [perl #81876] Fix typos by Peter J. Acklam. - 0.68 Tue Nov 23 20:17:22 2010 - doc: clarified about (backwards => [ ]) and (backwards => undef). - separated t/backwds.t from t/test.t. @@ -32,7 +24,7 @@ Revision history for Perl module Unicode::Collate. - 12 compat. ideographs (e.g. U+FA0E) are treated as unified ideographs. (though DUCET also does it, now Unicode::Collate does it without DUCET.) - added t/compatui.t. - ! Ideographs Ext.B (U+20000..U+2A6D6) can be overridden with UCA_Version 8. + ! Ideographs Ext.B (U+20000..U+2A6D6) can be overrided with UCA_Version 8. This is a long-standing behavior from Unicode::Collate 0.11 to 0.63. A wrong fix at 0.64 should be abandoned. @@ -129,8 +121,6 @@ Revision history for Perl module Unicode::Collate. - U+9FC4..U+9FCB and U+2A700..U+2B734 are new CJK unified ideographs. - Many hangul jamo are assigned (affecting hangul_terminator). - ! Now XSUB will be built by default. (XSUB needs a C compiler.) - To build pure perl, run disableXS before Makefile.PL. ! DUCET will be compiled when XS is used. Explicit saying 'allkeys.txt'> (or using another table) will prevent this module from using the compiled DUCET. @@ -184,11 +174,11 @@ Revision history for Perl module Unicode::Collate. (Perl 5.7.3 or before)). If perl 5.6.X is used, XSUB may help it in place of broken CORE::unpack('U*') in older perl. - added illegal.t and illegalp.t in t. - - added XSUB where some functions are implemented in XSUB. - Pure Perl is also supported. + - added XSUB (EXPERIMENTAL!) where some functions are implemented + in XSUB. Pure Perl is also supported. 0.30 Mon Oct 13 21:26:37 2003 - - fix: Completely ignorable in table should be able to be overridden + - fix: Completely ignorable in table should be able to be overrided by non-ignorable in entry. - fix: Maximum length for contraction must not be shortened by a shorter contraction following in table and/or entry. diff --git a/cpan/Unicode-Collate/Collate.pm b/cpan/Unicode-Collate/Collate.pm index 05822b2..b337b6f 100644 --- a/cpan/Unicode-Collate/Collate.pm +++ b/cpan/Unicode-Collate/Collate.pm @@ -14,13 +14,9 @@ use File::Spec; no warnings 'utf8'; -our $VERSION = '0.70'; +our $VERSION = '0.6801'; our $PACKAGE = __PACKAGE__; -require DynaLoader; -our @ISA = qw(DynaLoader); -bootstrap Unicode::Collate $VERSION; - my @Path = qw(Unicode Collate); my $KeyFile = "allkeys.txt"; @@ -75,8 +71,49 @@ use constant NON_VAR => 0; # Non-Variable character use constant VAR => 1; # Variable character # specific code points +use constant Hangul_SBase => 0xAC00; use constant Hangul_SIni => 0xAC00; use constant Hangul_SFin => 0xD7A3; +use constant Hangul_NCount => 588; +use constant Hangul_TCount => 28; +use constant Hangul_LBase => 0x1100; +use constant Hangul_LIni => 0x1100; +use constant Hangul_LFin => 0x1159; +use constant Hangul_LFill => 0x115F; +use constant Hangul_LEnd => 0x115F; # Unicode 5.2 +use constant Hangul_VBase => 0x1161; +use constant Hangul_VIni => 0x1160; # from Vowel Filler +use constant Hangul_VFin => 0x11A2; +use constant Hangul_VEnd => 0x11A7; # Unicode 5.2 +use constant Hangul_TBase => 0x11A7; # from "no-final" codepoint +use constant Hangul_TIni => 0x11A8; +use constant Hangul_TFin => 0x11F9; +use constant Hangul_TEnd => 0x11FF; # Unicode 5.2 +use constant HangulL2Ini => 0xA960; # Unicode 5.2 +use constant HangulL2Fin => 0xA97C; # Unicode 5.2 +use constant HangulV2Ini => 0xD7B0; # Unicode 5.2 +use constant HangulV2Fin => 0xD7C6; # Unicode 5.2 +use constant HangulT2Ini => 0xD7CB; # Unicode 5.2 +use constant HangulT2Fin => 0xD7FB; # Unicode 5.2 + +use constant CJK_UidIni => 0x4E00; +use constant CJK_UidFin => 0x9FA5; +use constant CJK_UidF41 => 0x9FBB; +use constant CJK_UidF51 => 0x9FC3; +use constant CJK_UidF52 => 0x9FCB; +use constant CJK_ExtAIni => 0x3400; # Unicode 3.0 +use constant CJK_ExtAFin => 0x4DB5; # Unicode 3.0 +use constant CJK_ExtBIni => 0x20000; # Unicode 3.1 +use constant CJK_ExtBFin => 0x2A6D6; # Unicode 3.1 +use constant CJK_ExtCIni => 0x2A700; # Unicode 5.2 +use constant CJK_ExtCFin => 0x2B734; # Unicode 5.2 +use constant CJK_ExtDIni => 0x2B740; # Unicode 6.0 +use constant CJK_ExtDFin => 0x2B81D; # Unicode 6.0 + +my %CompatUI = map +($_ => 1), ( + 0xFA0E, 0xFA0F, 0xFA11, 0xFA13, 0xFA14, 0xFA1F, + 0xFA21, 0xFA23, 0xFA24, 0xFA27, 0xFA28, 0xFA29, +); # Logical_Order_Exception in PropList.txt my $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ]; @@ -91,6 +128,10 @@ sub pack_U { return pack('U*', @_); } +sub unpack_U { + return unpack('U*', shift(@_).pack('U*')); +} + ###### my (%VariableOK); @@ -111,7 +152,6 @@ our @ChangeNG = qw/ versionTable alternateTable backwardsTable forwardsTable rearrangeTable derivCode normCode rearrangeHash backwardsFlag suppress suppressHash - __useXS /; # The hash key 'ignored' is deleted at v 0.21. # The hash key 'isShift' is deleted at v 0.23. @@ -245,12 +285,6 @@ sub new my $class = shift; my $self = bless { @_ }, $class; - if (! exists $self->{table} && - !defined $self->{undefName} && !defined $self->{ignoreName} && - !defined $self->{undefChar} && !defined $self->{ignoreChar}) { - $self->{__useXS} = \&_fetch_simple; - } # XS only - # keys of $self->{suppressHash} are $self->{suppress}. if ($self->{suppress} && @{ $self->{suppress} }) { @{ $self->{suppressHash} }{ @{ $self->{suppress} } } = (); @@ -313,20 +347,6 @@ sub parseAtmark { sub read_table { my $self = shift; - if ($self->{__useXS}) { - my @rest = _fetch_rest(); # complex matter need to parse - for my $line (@rest) { - next if $line =~ /^\s*#/; - - if ($line =~ s/^\s*\@//) { - $self->parseAtmark($line); - } else { - $self->parseEntry($line); - } - } - return; - } - my($f, $fh); foreach my $d (@INC) { $f = File::Spec->catfile($d, @Path, $self->{table}); @@ -425,12 +445,50 @@ sub parseEntry } +## +## VCE = _varCE(variable, VCE) +## +sub _varCE +{ + my $vbl = shift; + my $vce = shift; + if ($vbl eq 'non-ignorable') { + return $vce; + } + my ($var, @wt) = unpack VCE_TEMPLATE, $vce; + + if ($var) { + return pack(VCE_TEMPLATE, $var, 0, 0, 0, + $vbl eq 'blanked' ? $wt[3] : $wt[0]); + } + elsif ($vbl eq 'blanked') { + return $vce; + } + else { + return pack(VCE_TEMPLATE, $var, @wt[0..2], + $vbl eq 'shifted' && $wt[0]+$wt[1]+$wt[2] ? Shift4Wt : 0); + } +} + sub viewSortKey { my $self = shift; $self->visualizeSortKey($self->getSortKey(@_)); } +sub visualizeSortKey +{ + my $self = shift; + my $view = join " ", map sprintf("%04X", $_), unpack(KEY_TEMPLATE, shift); + + if ($self->{UCA_Version} <= 8) { + $view =~ s/ ?0000 ?/|/g; + } else { + $view =~ s/\b0000\b/|/g; + } + return "[$view]"; +} + ## ## arrayref of JCPS = splitEnt(string to be collated) @@ -448,7 +506,6 @@ sub splitEnt my $reH = $self->{rearrangeHash}; my $vers = $self->{UCA_Version}; my $ver9 = $vers >= 9 && $vers <= 11; - my $uXS = $self->{__useXS}; my ($str, @buf); @@ -487,9 +544,6 @@ sub splitEnt } elsif ($ver9) { $src[$i] = undef if $map->{ $src[$i] } && @{ $map->{ $src[$i] } } == 0; - if ($uXS) { - $src[$i] = undef if _ignorable_simple($src[$i]); - } } } @@ -569,8 +623,7 @@ sub splitEnt } # skip completely ignorable - if ($uXS && $jcps =~ /^[0-9]+\z/ && _ignorable_simple($jcps) || - $map->{$jcps} && @{ $map->{$jcps} } == 0) { + if ($map->{$jcps} && @{ $map->{$jcps} } == 0) { if ($wLen && @buf) { $buf[-1][2] = $i + 1; } @@ -609,13 +662,10 @@ sub getWt my $vbl = $self->{variable}; my $map = $self->{mapping}; my $der = $self->{derivCode}; - my $uXS = $self->{__useXS}; return if !defined $u; return map(_varCE($vbl, $_), @{ $map->{$u} }) if $map->{$u}; - return map(_varCE($vbl, $_), _fetch_simple($u)) - if $uXS && _exists_simple($u); # JCPS must not be a contraction, then it's a code point. if (Hangul_SIni <= $u && $u <= Hangul_SFin) { @@ -642,7 +692,7 @@ sub getWt $map->{$contract} and @decH = ($contract, $decH[2]); } # even if V's ignorable, LT contraction is not supported. - # If such a situation were required, NFD should be used. + # If such a situatution were required, NFD should be used. } if (@decH == 3 && $max->{$decH[1]}) { my $contract = join(CODE_SEP, @decH[1,2]); @@ -651,9 +701,7 @@ sub getWt } @hangulCE = map({ - $map->{$_} ? @{ $map->{$_} } : - $uXS && _exists_simple($_) ? _fetch_simple($_) : - $der->($_); + $map->{$_} ? @{ $map->{$_} } : $der->($_); } @decH); } return map _varCE($vbl, $_), @hangulCE; @@ -678,10 +726,12 @@ sub getWt sub getSortKey { my $self = shift; + my $lev = $self->{level}; my $rEnt = $self->splitEnt(shift); # get an arrayref of JCPS my $vers = $self->{UCA_Version}; my $vbl = $self->{variable}; my $term = $self->{hangul_terminator}; + my $v2i = $vers >= 9 && $vbl ne 'non-ignorable'; my @buf; # weight arrays if ($term) { @@ -706,7 +756,53 @@ sub getSortKey } } - return $self->mk_SortKey(\@buf); + # make sort key + my @ret = ([],[],[],[]); + my $last_is_variable; + + foreach my $vwt (@buf) { + my($var, @wt) = unpack(VCE_TEMPLATE, $vwt); + + # "Ignorable (L1, L2) after Variable" since track. v. 9 + if ($v2i) { + if ($var) { + $last_is_variable = TRUE; + } elsif (!$wt[0]) { # ignorable + next if $last_is_variable; + } else { + $last_is_variable = FALSE; + } + } + foreach my $v (0..$lev-1) { + 0 < $wt[$v] and push @{ $ret[$v] }, $wt[$v]; + } + } + + # modification of tertiary weights + if ($self->{upper_before_lower}) { + foreach my $w (@{ $ret[2] }) { + if (0x8 <= $w && $w <= 0xC) { $w -= 6 } # lower + elsif (0x2 <= $w && $w <= 0x6) { $w += 6 } # upper + elsif ($w == 0x1C) { $w += 1 } # square upper + elsif ($w == 0x1D) { $w -= 1 } # square lower + } + } + if ($self->{katakana_before_hiragana}) { + foreach my $w (@{ $ret[2] }) { + if (0x0F <= $w && $w <= 0x13) { $w -= 2 } # katakana + elsif (0x0D <= $w && $w <= 0x0E) { $w += 5 } # hiragana + } + } + + if ($self->{backwardsFlag}) { + for (my $v = MinLevel; $v <= MaxLevel; $v++) { + if ($self->{backwardsFlag} & (1 << $v)) { + @{ $ret[$v-1] } = reverse @{ $ret[$v-1] }; + } + } + } + + join LEVEL_SEP, map pack(KEY_TEMPLATE, @$_), @ret; } @@ -733,6 +829,174 @@ sub sort { } +sub _derivCE_22 { + my $u = shift; + my $base = (CJK_UidIni <= $u && $u <= CJK_UidF52 || $CompatUI{$u}) + ? 0xFB40 : # CJK + (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin || + CJK_ExtBIni <= $u && $u <= CJK_ExtBFin || + CJK_ExtCIni <= $u && $u <= CJK_ExtCFin || + CJK_ExtDIni <= $u && $u <= CJK_ExtDFin) + ? 0xFB80 # CJK ext. + : 0xFBC0; # others + my $aaaa = $base + ($u >> 15); + my $bbbb = ($u & 0x7FFF) | 0x8000; + return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u), + pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u); +} + +sub _derivCE_20 { + my $u = shift; + my $base = (CJK_UidIni <= $u && $u <= CJK_UidF52 || $CompatUI{$u}) + ? 0xFB40 : # CJK + (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin || + CJK_ExtBIni <= $u && $u <= CJK_ExtBFin || + CJK_ExtCIni <= $u && $u <= CJK_ExtCFin) + ? 0xFB80 # CJK ext. + : 0xFBC0; # others + my $aaaa = $base + ($u >> 15); + my $bbbb = ($u & 0x7FFF) | 0x8000; + return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u), + pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u); +} + +sub _derivCE_18 { + my $u = shift; + my $base = (CJK_UidIni <= $u && $u <= CJK_UidF51 || $CompatUI{$u}) + ? 0xFB40 : # CJK + (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin || + CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) + ? 0xFB80 # CJK ext. + : 0xFBC0; # others + my $aaaa = $base + ($u >> 15); + my $bbbb = ($u & 0x7FFF) | 0x8000; + return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u), + pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u); +} + +sub _derivCE_14 { + my $u = shift; + my $base = (CJK_UidIni <= $u && $u <= CJK_UidF41 || $CompatUI{$u}) + ? 0xFB40 : # CJK + (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin || + CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) + ? 0xFB80 # CJK ext. + : 0xFBC0; # others + my $aaaa = $base + ($u >> 15); + my $bbbb = ($u & 0x7FFF) | 0x8000; + return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u), + pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u); +} + +sub _derivCE_9 { + my $u = shift; + my $base = (CJK_UidIni <= $u && $u <= CJK_UidFin || $CompatUI{$u}) + ? 0xFB40 : # CJK + (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin || + CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) + ? 0xFB80 # CJK ext. + : 0xFBC0; # others + my $aaaa = $base + ($u >> 15); + my $bbbb = ($u & 0x7FFF) | 0x8000; + return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u), + pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u); +} + +sub _derivCE_8 { + my $code = shift; + my $aaaa = 0xFF80 + ($code >> 15); + my $bbbb = ($code & 0x7FFF) | 0x8000; + return pack(VCE_TEMPLATE, NON_VAR, $aaaa, 2, 1, $code), + pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $code); +} + +sub _uideoCE_8 { + my $u = shift; + return pack(VCE_TEMPLATE, NON_VAR, $u, Min2Wt, Min3Wt, $u); +} + +sub _isUIdeo { + # $uca_vers = 0 for _uideoCE_8() + my ($u, $uca_vers) = @_; + return((CJK_UidIni <= $u && ( + $uca_vers >= 20 ? ($u <= CJK_UidF52) : + $uca_vers >= 18 ? ($u <= CJK_UidF51) : + $uca_vers >= 14 ? ($u <= CJK_UidF41) : + ($u <= CJK_UidFin))) || $CompatUI{$u} + || + (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin) + || + ($uca_vers >= 8 && CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) + || + ($uca_vers >= 20 && CJK_ExtCIni <= $u && $u <= CJK_ExtCFin) + || + ($uca_vers >= 22 && CJK_ExtDIni <= $u && $u <= CJK_ExtDFin) + ); +} + + +## +## "hhhh hhhh hhhh" to (dddd, dddd, dddd) +## +sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g } + +# +# $code *must* be in Hangul syllable. +# Check it before you enter here. +# +sub _decompHangul { + my $code = shift; + my $si = $code - Hangul_SBase; + my $li = int( $si / Hangul_NCount); + my $vi = int(($si % Hangul_NCount) / Hangul_TCount); + my $ti = $si % Hangul_TCount; + return ( + Hangul_LBase + $li, + Hangul_VBase + $vi, + $ti ? (Hangul_TBase + $ti) : (), + ); +} + +sub _isIllegal { + my $code = shift; + return((! defined $code) # removed + || ($code < 0 || 0x10FFFF < $code) # out of range + ); +} + +sub _isNonchar { + my $code = shift; + return((($code & 0xFFFE) == 0xFFFE) # ??FFF[EF] (cf. utf8.c) + || (0xD800 <= $code && $code <= 0xDFFF) # unpaired surrogates + || (0xFDD0 <= $code && $code <= 0xFDEF) # other non-characters + ); +} + +# Hangul Syllable Type +sub getHST { + my $u = shift; + my $vers = shift || 0; + + if (Hangul_SIni <= $u && $u <= Hangul_SFin) { + return +($u - Hangul_SBase) % Hangul_TCount ? "LVT" : "LV"; + } + + if ($vers < 20) { + return Hangul_LIni <= $u && $u <= Hangul_LFin || + $u == Hangul_LFill ? "L" : + Hangul_VIni <= $u && $u <= Hangul_VFin ? "V" : + Hangul_TIni <= $u && $u <= Hangul_TFin ? "T" : ""; + } else { + return Hangul_LIni <= $u && $u <= Hangul_LEnd || + HangulL2Ini <= $u && $u <= HangulL2Fin ? "L" : + Hangul_VIni <= $u && $u <= Hangul_VEnd || + HangulV2Ini <= $u && $u <= HangulV2Fin ? "V" : + Hangul_TIni <= $u && $u <= Hangul_TEnd || + HangulT2Ini <= $u && $u <= HangulT2Fin ? "T" : ""; + } +} + + ## ## bool _nonIgnorAtLevel(arrayref weights, int level) ## @@ -759,7 +1023,7 @@ sub _eqArray($$$) my $lev = shift; for my $g (0..@$substr-1){ - # Do the $g'th graphemes have the same number of AV weights? + # Do the $g'th graphemes have the same number of AV weigths? return if @{ $source->[$g] } != @{ $substr->[$g] }; for my $w (0..@{ $substr->[$g] }-1) { @@ -1057,7 +1321,7 @@ The following tracking versions are supported. The default is 20. Note: Recent UTS #10 renames "Tracking Version" to "Revision." -* Noncharacters (e.g. U+FFFF) are not ignored, and can be overridden +* Noncharacters (e.g. U+FFFF) are not ignored, and can be overrided since C 22. * Fully ignorable characters were ignored, and would not interrupt @@ -1095,7 +1359,7 @@ forwards at all the levels. If the same character (or a sequence of characters) exists in the collation element table through C
, -mapping to collation elements is overridden. +mapping to collation elements is overrided. If it does not exist, the mapping is defined additionally. entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt) @@ -1272,7 +1536,7 @@ those in the CJK Unified Ideographs Extension A etc. Ext.D (U+2B740..U+2B81D) if UCA_Version is 22 or greater. Through C, ordering of CJK unified ideographs (including -extensions) can be overridden. +extensions) can be overrided. ex. CJK unified ideographs in the JIS code point order. @@ -1315,7 +1579,7 @@ in C
or C is still valid. B In addition to them, 12 CJK compatibility ideographs (C, C, C, C, C, C, C, C, C, C, C, C) are also treated as CJK unified -ideographs. But they can't be overridden via C when you use +ideographs. But they can't be overrided via C when you use DUCET, as the table includes weights for them. C
or C has priority over C. @@ -1325,7 +1589,7 @@ priority over C. By default, Hangul syllables are decomposed into Hangul Jamo, even if C<(normalization =E undef)>. -But the mapping of Hangul syllables may be overridden. +But the mapping of Hangul syllables may be overrided. This parameter works like C, so see there for examples. @@ -1486,7 +1750,7 @@ this parameter doesn't work validly. This key allows to variable weighting for variable collation elements, which are marked with an ASTERISK in the table -(NOTE: Many punctuation marks and symbols are variable in F). +(NOTE: Many punction marks and symbols are variable in F). variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'. @@ -1794,7 +2058,7 @@ B =head1 AUTHOR, COPYRIGHT AND LICENSE The Unicode::Collate module for perl was written by SADAHIRO Tomoyuki, -. This module is Copyright(C) 2001-2011, +. This module is Copyright(C) 2001-2010, SADAHIRO Tomoyuki. Japan. All rights reserved. This module is free software; you can redistribute it and/or diff --git a/cpan/Unicode-Collate/Collate.xs b/cpan/Unicode-Collate/Collate.xs deleted file mode 100644 index d6004bd..0000000 --- a/cpan/Unicode-Collate/Collate.xs +++ /dev/null @@ -1,691 +0,0 @@ -#include "EXTERN.h" -#include "perl.h" -#include "XSUB.h" - -/* This file is prepared by mkheader */ -#include "ucatbl.h" - -/* Perl 5.6.1 ? */ -#ifndef utf8n_to_uvuni -#define utf8n_to_uvuni utf8_to_uv -#endif /* utf8n_to_uvuni */ - -/* UTF8_ALLOW_BOM is used before Perl 5.8.0 */ -#ifndef UTF8_ALLOW_BOM -#define UTF8_ALLOW_BOM (0) -#endif /* UTF8_ALLOW_BOM */ - -#ifndef UTF8_ALLOW_SURROGATE -#define UTF8_ALLOW_SURROGATE (0) -#endif /* UTF8_ALLOW_SURROGATE */ - -#ifndef UTF8_ALLOW_FE_FF -#define UTF8_ALLOW_FE_FF (0) -#endif /* UTF8_ALLOW_FE_FF */ - -#ifndef UTF8_ALLOW_FFFF -#define UTF8_ALLOW_FFFF (0) -#endif /* UTF8_ALLOW_FFFF */ - -#define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|UTF8_ALLOW_FE_FF|UTF8_ALLOW_FFFF) - -/* if utf8n_to_uvuni() sets retlen to 0 (?) */ -#define ErrRetlenIsZero "panic (Unicode::Collate): zero-length character" - -/* At present, char > 0x10ffff are unaffected without complaint, right? */ -#define VALID_UTF_MAX (0x10ffff) -#define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv)) - -static const UV max_div_16 = UV_MAX / 16; - -/* Supported Levels */ -#define MinLevel (1) -#define MaxLevel (4) - -/* Shifted weight at 4th level */ -#define Shift4Wt (0xFFFF) - -#define VCE_Length (9) - -#define Hangul_SBase (0xAC00) -#define Hangul_SIni (0xAC00) -#define Hangul_SFin (0xD7A3) -#define Hangul_NCount (588) -#define Hangul_TCount (28) -#define Hangul_LBase (0x1100) -#define Hangul_LIni (0x1100) -#define Hangul_LFin (0x1159) -#define Hangul_LFill (0x115F) -#define Hangul_LEnd (0x115F) /* Unicode 5.2 */ -#define Hangul_VBase (0x1161) -#define Hangul_VIni (0x1160) /* from Vowel Filler */ -#define Hangul_VFin (0x11A2) -#define Hangul_VEnd (0x11A7) /* Unicode 5.2 */ -#define Hangul_TBase (0x11A7) /* from "no-final" codepoint */ -#define Hangul_TIni (0x11A8) -#define Hangul_TFin (0x11F9) -#define Hangul_TEnd (0x11FF) /* Unicode 5.2 */ -#define HangulL2Ini (0xA960) /* Unicode 5.2 */ -#define HangulL2Fin (0xA97C) /* Unicode 5.2 */ -#define HangulV2Ini (0xD7B0) /* Unicode 5.2 */ -#define HangulV2Fin (0xD7C6) /* Unicode 5.2 */ -#define HangulT2Ini (0xD7CB) /* Unicode 5.2 */ -#define HangulT2Fin (0xD7FB) /* Unicode 5.2 */ - -#define CJK_UidIni (0x4E00) -#define CJK_UidFin (0x9FA5) -#define CJK_UidF41 (0x9FBB) -#define CJK_UidF51 (0x9FC3) -#define CJK_UidF52 (0x9FCB) -#define CJK_ExtAIni (0x3400) /* Unicode 3.0 */ -#define CJK_ExtAFin (0x4DB5) /* Unicode 3.0 */ -#define CJK_ExtBIni (0x20000) /* Unicode 3.1 */ -#define CJK_ExtBFin (0x2A6D6) /* Unicode 3.1 */ -#define CJK_ExtCIni (0x2A700) /* Unicode 5.2 */ -#define CJK_ExtCFin (0x2B734) /* Unicode 5.2 */ -#define CJK_ExtDIni (0x2B740) /* Unicode 6.0 */ -#define CJK_ExtDFin (0x2B81D) /* Unicode 6.0 */ - -static STDCHAR UnifiedCompat[] = { - 1,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,0,1,1,1 -}; /* E F 0 1 2 3 4 5 6 7 8 9 A B C D E F 0 1 2 3 4 5 6 7 8 9 */ - -#define codeRange(bcode, ecode) ((bcode) <= code && code <= (ecode)) - -MODULE = Unicode::Collate PACKAGE = Unicode::Collate - -PROTOTYPES: DISABLE - -void -_fetch_rest () - PREINIT: - char ** rest; - PPCODE: - for (rest = UCA_rest; *rest; ++rest) { - XPUSHs(sv_2mortal(newSVpv((char *) *rest, 0))); - } - - -void -_fetch_simple (uv) - UV uv - PREINIT: - U8 ***plane, **row; - char* result = NULL; - PPCODE: - if (!OVER_UTF_MAX(uv)){ - plane = (U8***)UCA_simple[uv >> 16]; - if (plane) { - row = plane[(uv >> 8) & 0xff]; - result = row ? row[uv & 0xff] : NULL; - } - } - if (result) { - int i; - int num = (int)*result; - ++result; - for (i = 0; i < num; ++i) { - XPUSHs(sv_2mortal(newSVpvn((char *) result, VCE_Length))); - result += VCE_Length; - } - } else { - XPUSHs(sv_2mortal(newSViv(0))); - } - -SV* -_ignorable_simple (uv) - UV uv - ALIAS: - _exists_simple = 1 - PREINIT: - U8 ***plane, **row; - int num = -1; - char* result = NULL; - CODE: - if (!OVER_UTF_MAX(uv)){ - plane = (U8***)UCA_simple[uv >> 16]; - if (plane) { - row = plane[(uv >> 8) & 0xff]; - result = row ? row[uv & 0xff] : NULL; - } - if (result) - num = (int)*result; /* assuming 0 <= num < 128 */ - } - - if (ix) - RETVAL = boolSV(num >0); - else - RETVAL = boolSV(num==0); - OUTPUT: - RETVAL - - -void -_getHexArray (src) - SV* src - PREINIT: - char *s, *e; - STRLEN byte; - UV value; - bool overflowed = FALSE; - const char *hexdigit; - PPCODE: - s = SvPV(src,byte); - for (e = s + byte; s < e;) { - hexdigit = strchr((char *) PL_hexdigit, *s++); - if (! hexdigit) - continue; - value = (hexdigit - PL_hexdigit) & 0xF; - while (*s) { - hexdigit = strchr((char *) PL_hexdigit, *s++); - if (! hexdigit) - break; - if (overflowed) - continue; - if (value > max_div_16) { - overflowed = TRUE; - continue; - } - value = (value << 4) | ((hexdigit - PL_hexdigit) & 0xF); - } - XPUSHs(sv_2mortal(newSVuv(overflowed ? UV_MAX : value))); - } - - -SV* -_isIllegal (sv) - SV* sv - PREINIT: - UV uv; - CODE: - if (!sv || !SvIOK(sv)) - XSRETURN_YES; - uv = SvUVX(sv); - RETVAL = boolSV( - 0x10FFFF < uv /* out of range */ - ); -OUTPUT: - RETVAL - - -SV* -_isNonchar (sv) - SV* sv - PREINIT: - UV uv; - CODE: - /* should be called only if ! _isIllegal(sv). */ - uv = SvUVX(sv); - RETVAL = boolSV( - ((uv & 0xFFFE) == 0xFFFE) /* ??FFF[EF] (cf. utf8.c) */ - || (0xD800 <= uv && uv <= 0xDFFF) /* unpaired surrogates */ - || (0xFDD0 <= uv && uv <= 0xFDEF) /* other non-characters */ - ); -OUTPUT: - RETVAL - - -void -_decompHangul (code) - UV code - PREINIT: - UV sindex, lindex, vindex, tindex; - PPCODE: - /* code *must* be in Hangul syllable. - * Check it before you enter here. */ - sindex = code - Hangul_SBase; - lindex = sindex / Hangul_NCount; - vindex = (sindex % Hangul_NCount) / Hangul_TCount; - tindex = sindex % Hangul_TCount; - - XPUSHs(sv_2mortal(newSVuv(lindex + Hangul_LBase))); - XPUSHs(sv_2mortal(newSVuv(vindex + Hangul_VBase))); - if (tindex) - XPUSHs(sv_2mortal(newSVuv(tindex + Hangul_TBase))); - - -SV* -getHST (code, uca_vers = 0) - UV code; - IV uca_vers; - PREINIT: - char * hangtype; - STRLEN typelen; - CODE: - if (codeRange(Hangul_SIni, Hangul_SFin)) { - if ((code - Hangul_SBase) % Hangul_TCount) { - hangtype = "LVT"; typelen = 3; - } else { - hangtype = "LV"; typelen = 2; - } - } else if (uca_vers < 20) { - if (codeRange(Hangul_LIni, Hangul_LFin) || code == Hangul_LFill) { - hangtype = "L"; typelen = 1; - } else if (codeRange(Hangul_VIni, Hangul_VFin)) { - hangtype = "V"; typelen = 1; - } else if (codeRange(Hangul_TIni, Hangul_TFin)) { - hangtype = "T"; typelen = 1; - } else { - hangtype = ""; typelen = 0; - } - } else { - if (codeRange(Hangul_LIni, Hangul_LEnd) || - codeRange(HangulL2Ini, HangulL2Fin)) { - hangtype = "L"; typelen = 1; - } else if (codeRange(Hangul_VIni, Hangul_VEnd) || - codeRange(HangulV2Ini, HangulV2Fin)) { - hangtype = "V"; typelen = 1; - } else if (codeRange(Hangul_TIni, Hangul_TEnd) || - codeRange(HangulT2Ini, HangulT2Fin)) { - hangtype = "T"; typelen = 1; - } else { - hangtype = ""; typelen = 0; - } - } - - RETVAL = newSVpvn(hangtype, typelen); -OUTPUT: - RETVAL - - -void -_derivCE_9 (code) - UV code - ALIAS: - _derivCE_14 = 1 - _derivCE_18 = 2 - _derivCE_20 = 3 - _derivCE_22 = 4 - PREINIT: - UV base, aaaa, bbbb; - U8 a[VCE_Length + 1] = "\x00\xFF\xFF\x00\x20\x00\x02\xFF\xFF"; - U8 b[VCE_Length + 1] = "\x00\xFF\xFF\x00\x00\x00\x00\xFF\xFF"; - bool basic_unified = 0; - PPCODE: - if (CJK_UidIni <= code) { - if (codeRange(0xFA0E, 0xFA29)) - basic_unified = (bool)UnifiedCompat[code - 0xFA0E]; - else - basic_unified = (ix >= 3 ? (code <= CJK_UidF52) : - ix == 2 ? (code <= CJK_UidF51) : - ix == 1 ? (code <= CJK_UidF41) : - (code <= CJK_UidFin)); - } - base = (basic_unified) - ? 0xFB40 : /* CJK */ - ((codeRange(CJK_ExtAIni, CJK_ExtAFin)) - || - (codeRange(CJK_ExtBIni, CJK_ExtBFin)) - || - (ix >= 3 && codeRange(CJK_ExtCIni, CJK_ExtCFin)) - || - (ix >= 4 && codeRange(CJK_ExtDIni, CJK_ExtDFin))) - ? 0xFB80 /* CJK ext. */ - : 0xFBC0; /* others */ - aaaa = base + (code >> 15); - bbbb = (code & 0x7FFF) | 0x8000; - a[1] = (U8)(aaaa >> 8); - a[2] = (U8)(aaaa & 0xFF); - b[1] = (U8)(bbbb >> 8); - b[2] = (U8)(bbbb & 0xFF); - a[7] = b[7] = (U8)(code >> 8); - a[8] = b[8] = (U8)(code & 0xFF); - XPUSHs(sv_2mortal(newSVpvn((char *) a, VCE_Length))); - XPUSHs(sv_2mortal(newSVpvn((char *) b, VCE_Length))); - - -void -_derivCE_8 (code) - UV code - PREINIT: - UV aaaa, bbbb; - U8 a[VCE_Length + 1] = "\x00\xFF\xFF\x00\x02\x00\x01\xFF\xFF"; - U8 b[VCE_Length + 1] = "\x00\xFF\xFF\x00\x00\x00\x00\xFF\xFF"; - PPCODE: - aaaa = 0xFF80 + (code >> 15); - bbbb = (code & 0x7FFF) | 0x8000; - a[1] = (U8)(aaaa >> 8); - a[2] = (U8)(aaaa & 0xFF); - b[1] = (U8)(bbbb >> 8); - b[2] = (U8)(bbbb & 0xFF); - a[7] = b[7] = (U8)(code >> 8); - a[8] = b[8] = (U8)(code & 0xFF); - XPUSHs(sv_2mortal(newSVpvn((char *) a, VCE_Length))); - XPUSHs(sv_2mortal(newSVpvn((char *) b, VCE_Length))); - - -void -_uideoCE_8 (code) - UV code - PREINIT: - U8 uice[VCE_Length + 1] = "\x00\xFF\xFF\x00\x20\x00\x02\xFF\xFF"; - PPCODE: - uice[1] = uice[7] = (U8)(code >> 8); - uice[2] = uice[8] = (U8)(code & 0xFF); - XPUSHs(sv_2mortal(newSVpvn((char *) uice, VCE_Length))); - - -SV* -_isUIdeo (code, uca_vers) - UV code; - IV uca_vers; - bool basic_unified = 0; - CODE: - /* uca_vers = 0 for _uideoCE_8() */ - if (CJK_UidIni <= code) { - if (codeRange(0xFA0E, 0xFA29)) - basic_unified = (bool)UnifiedCompat[code - 0xFA0E]; - else - basic_unified = (uca_vers >= 20 ? (code <= CJK_UidF52) : - uca_vers >= 18 ? (code <= CJK_UidF51) : - uca_vers >= 14 ? (code <= CJK_UidF41) : - (code <= CJK_UidFin)); - } - RETVAL = boolSV( - (basic_unified) - || - (codeRange(CJK_ExtAIni, CJK_ExtAFin)) - || - (uca_vers >= 8 && codeRange(CJK_ExtBIni, CJK_ExtBFin)) - || - (uca_vers >= 20 && codeRange(CJK_ExtCIni, CJK_ExtCFin)) - || - (uca_vers >= 22 && codeRange(CJK_ExtDIni, CJK_ExtDFin)) - ); -OUTPUT: - RETVAL - - -SV* -mk_SortKey (self, buf) - SV* self; - SV* buf; - PREINIT: - SV *dst, **svp; - STRLEN dlen, vlen; - U8 *d, *p, *e, *v, *s[MaxLevel], *eachlevel[MaxLevel]; - AV *bufAV; - HV *selfHV; - UV back_flag; - I32 i, buf_len; - IV lv, level, uca_vers; - bool upper_lower, kata_hira, v2i, last_is_var; - CODE: - if (SvROK(self) && SvTYPE(SvRV(self)) == SVt_PVHV) - selfHV = (HV*)SvRV(self); - else - croak("$self is not a HASHREF."); - - svp = hv_fetch(selfHV, "level", 5, FALSE); - level = svp ? SvIV(*svp) : MaxLevel; - - if (SvROK(buf) && SvTYPE(SvRV(buf)) == SVt_PVAV) - bufAV = (AV*)SvRV(buf); - else - croak("XSUB, not an ARRAYREF."); - - buf_len = av_len(bufAV); - - if (buf_len < 0) { /* empty: -1 */ - dlen = 2 * (MaxLevel - 1); - dst = newSV(dlen); - (void)SvPOK_only(dst); - d = SvPVX(dst); - while (dlen--) - *d++ = '\0'; - } - else { - for (lv = 0; lv < level; lv++) { - New(0, eachlevel[lv], 2 * (1 + buf_len) + 1, U8); - s[lv] = eachlevel[lv]; - } - - svp = hv_fetch(selfHV, "upper_before_lower", 18, FALSE); - upper_lower = svp ? SvTRUE(*svp) : FALSE; - svp = hv_fetch(selfHV, "katakana_before_hiragana", 24, FALSE); - kata_hira = svp ? SvTRUE(*svp) : FALSE; - svp = hv_fetch(selfHV, "UCA_Version", 11, FALSE); - uca_vers = SvIV(*svp); - svp = hv_fetch(selfHV, "variable", 8, FALSE); - v2i = uca_vers >= 9 && svp /* (vers >= 9) and not (non-ignorable) */ - ? !(SvCUR(*svp) == 13 && memEQ(SvPVX(*svp), "non-ignorable", 13)) - : FALSE; - - last_is_var = FALSE; - for (i = 0; i <= buf_len; i++) { - svp = av_fetch(bufAV, i, FALSE); - - if (svp && SvPOK(*svp)) - v = SvPV(*svp, vlen); - else - croak("not a vwt."); - - if (vlen < VCE_Length) /* ignore short VCE (unexpected) */ - continue; - - /* "Ignorable (L1, L2) after Variable" since track. v. 9 */ - if (v2i) { - if (*v) - last_is_var = TRUE; - else if (v[1] || v[2]) /* non zero primary weight */ - last_is_var = FALSE; - else if (last_is_var) /* zero primary weight; skipped */ - continue; - } - - if (v[5] == 0) { /* tert wt < 256 */ - if (upper_lower) { - if (0x8 <= v[6] && v[6] <= 0xC) /* lower */ - v[6] -= 6; - else if (0x2 <= v[6] && v[6] <= 0x6) /* upper */ - v[6] += 6; - else if (v[6] == 0x1C) /* square upper */ - v[6]++; - else if (v[6] == 0x1D) /* square lower */ - v[6]--; - } - if (kata_hira) { - if (0x0F <= v[6] && v[6] <= 0x13) /* katakana */ - v[6] -= 2; - else if (0xD <= v[6] && v[6] <= 0xE) /* hiragana */ - v[6] += 5; - } - } - - for (lv = 0; lv < level; lv++) { - if (v[2 * lv + 1] || v[2 * lv + 2]) { - *s[lv]++ = v[2 * lv + 1]; - *s[lv]++ = v[2 * lv + 2]; - } - } - } - - dlen = 2 * (MaxLevel - 1); - for (lv = 0; lv < level; lv++) - dlen += s[lv] - eachlevel[lv]; - - dst = newSV(dlen); - (void)SvPOK_only(dst); - d = SvPVX(dst); - - svp = hv_fetch(selfHV, "backwardsFlag", 13, FALSE); - back_flag = svp ? SvUV(*svp) : (UV)0; - - for (lv = 0; lv < level; lv++) { - if (back_flag & (1 << (lv + 1))) { - p = s[lv]; - e = eachlevel[lv]; - for ( ; e < p; p -= 2) { - *d++ = p[-2]; - *d++ = p[-1]; - } - } - else { - p = eachlevel[lv]; - e = s[lv]; - while (p < e) - *d++ = *p++; - } - if (lv + 1 < MaxLevel) { /* lv + 1 == real level */ - *d++ = '\0'; - *d++ = '\0'; - } - } - - for (lv = level; lv < MaxLevel; lv++) { - if (lv + 1 < MaxLevel) { /* lv + 1 == real level */ - *d++ = '\0'; - *d++ = '\0'; - } - } - - for (lv = 0; lv < level; lv++) { - Safefree(eachlevel[lv]); - } - } - *d = '\0'; - SvCUR_set(dst, d - (U8*)SvPVX(dst)); - RETVAL = dst; -OUTPUT: - RETVAL - - -SV* -_varCE (vbl, vce) - SV* vbl - SV* vce - PREINIT: - SV *dst; - U8 *a, *v, *d; - STRLEN alen, vlen; - CODE: - a = (U8*)SvPV(vbl, alen); - v = (U8*)SvPV(vce, vlen); - - dst = newSV(vlen); - d = (U8*)SvPVX(dst); - (void)SvPOK_only(dst); - Copy(v, d, vlen, U8); - SvCUR_set(dst, vlen); - d[vlen] = '\0'; - - /* variable: checked only the first char and the length, - trusting checkCollator() and %VariableOK in Perl ... */ - - if (vlen < VCE_Length /* ignore short VCE (unexpected) */ - || - *a == 'n') /* 'non-ignorable' */ - 1; - else if (*v) { - if (*a == 's') { /* shifted or shift-trimmed */ - d[7] = d[1]; /* wt level 1 to 4 */ - d[8] = d[2]; - } - d[1] = d[2] = d[3] = d[4] = d[5] = d[6] = '\0'; - } - else if (*a == 'b') /* blanked */ - 1; - else if (*a == 's') { /* shifted or shift-trimmed */ - if (alen == 7 && (d[1] + d[2] + d[3] + d[4] + d[5] + d[6])) { - d[7] = (U8)(Shift4Wt >> 8); - d[8] = (U8)(Shift4Wt & 0xFF); - } - else { - d[7] = d[8] = 0; - } - } - else - croak("unknown variable value '%s'", a); - RETVAL = dst; -OUTPUT: - RETVAL - - - -SV* -visualizeSortKey (self, key) - SV * self - SV * key - PREINIT: - HV *selfHV; - SV **svp, *dst; - U8 *s, *e, *d; - STRLEN klen, dlen; - UV uv; - IV uca_vers; - static char *upperhex = "0123456789ABCDEF"; - CODE: - if (SvROK(self) && SvTYPE(SvRV(self)) == SVt_PVHV) - selfHV = (HV*)SvRV(self); - else - croak("$self is not a HASHREF."); - - svp = hv_fetch(selfHV, "UCA_Version", 11, FALSE); - if (!svp) - croak("Panic: no $self->{UCA_Version} in visualizeSortKey"); - uca_vers = SvIV(*svp); - - s = (U8*)SvPV(key, klen); - - /* slightly *longer* than the need, but I'm afraid of miscounting; - exactly: (klen / 2) * 5 + MaxLevel * 2 - 1 (excluding '\0') - = (klen / 2) * 5 - 1 # FFFF (16bit) and ' ' between 16bit units - + (MaxLevel - 1) * 2 # ' ' and '|' for level boundaries - + 2 # '[' and ']' - */ - dlen = (klen / 2) * 5 + MaxLevel * 2 + 2; - dst = newSV(dlen); - (void)SvPOK_only(dst); - d = (U8*)SvPVX(dst); - - *d++ = '['; - for (e = s + klen; s < e; s += 2) { - uv = (U16)(*s << 8 | s[1]); - if (uv) { - if ((d[-1] != '[') && ((9 <= uca_vers) || (d[-1] != '|'))) - *d++ = ' '; - *d++ = upperhex[ (s[0] >> 4) & 0xF ]; - *d++ = upperhex[ s[0] & 0xF ]; - *d++ = upperhex[ (s[1] >> 4) & 0xF ]; - *d++ = upperhex[ s[1] & 0xF ]; - } - else { - if ((9 <= uca_vers) && (d[-1] != '[')) - *d++ = ' '; - *d++ = '|'; - } - } - *d++ = ']'; - *d = '\0'; - SvCUR_set(dst, d - (U8*)SvPVX(dst)); - RETVAL = dst; -OUTPUT: - RETVAL - - - -void -unpack_U (src) - SV* src - PREINIT: - STRLEN srclen, retlen; - U8 *s, *p, *e; - UV uv; - PPCODE: - s = (U8*)SvPV(src,srclen); - if (!SvUTF8(src)) { - SV* tmpsv = sv_mortalcopy(src); - if (!SvPOK(tmpsv)) - (void)sv_pvn_force(tmpsv,&srclen); - sv_utf8_upgrade(tmpsv); - s = (U8*)SvPV(tmpsv,srclen); - } - e = s + srclen; - - for (p = s; p < e; p += retlen) { - uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); - if (!retlen) - croak(ErrRetlenIsZero); - XPUSHs(sv_2mortal(newSVuv(uv))); - } - diff --git a/cpan/Unicode-Collate/Collate/Locale.pm b/cpan/Unicode-Collate/Collate/Locale.pm index 39f04fc..5dddfb8 100644 --- a/cpan/Unicode-Collate/Collate/Locale.pm +++ b/cpan/Unicode-Collate/Collate/Locale.pm @@ -4,11 +4,12 @@ use strict; use Carp; use base qw(Unicode::Collate); -our $VERSION = '0.70'; +our $VERSION = '0.68'; use File::Spec; (my $ModPath = $INC{'Unicode/Collate/Locale.pm'}) =~ s/\.pm$//; +my $KeyPath = File::Spec->catfile('allkeys.txt'); my $PL_EXT = '.pl'; my %LocaleFile = map { ($_, $_) } qw( @@ -70,6 +71,7 @@ sub new { if (exists $hash{table}) { croak "your table can't be used with Unicode::Collate::Locale"; } + $hash{table} = $KeyPath; my $href = _fetchpl($hash{accepted_locale}); while (my($k,$v) = each %$href) { @@ -295,7 +297,7 @@ tailored as well as it. For example, even though W is tailored, fullwidth W (C), W with acute (C), etc. are not tailored. The result may depend on whether source strings are normalized or not, and whether decomposed or composed. -Thus C<(normalization =E undef)> is less preferred. +Thus C<(normalization =E undef> is less preferred. =back @@ -303,7 +305,7 @@ Thus C<(normalization =E undef)> is less preferred. The Unicode::Collate::Locale module for perl was written by SADAHIRO Tomoyuki, . -This module is Copyright(C) 2004-2011, SADAHIRO Tomoyuki. Japan. +This module is Copyright(C) 2004-2010, SADAHIRO Tomoyuki. Japan. All rights reserved. This module is free software; you can redistribute it and/or diff --git a/cpan/Unicode-Collate/Makefile.PL b/cpan/Unicode-Collate/Makefile.PL deleted file mode 100644 index 30d6fc0..0000000 --- a/cpan/Unicode-Collate/Makefile.PL +++ /dev/null @@ -1,28 +0,0 @@ -require 5.006001; -use ExtUtils::MakeMaker; - -my $clean = {}; - -if (-f "Collate.xs") { - print STDERR "Making header files for XS...\n"; - - do 'mkheader' or die $@ || "mkheader: $!"; - - $clean = { FILES => 'ucatbl.h' }; -} - -WriteMakefile( - 'INSTALLDIRS' => $] >= 5.007002 ? 'perl' : 'site', - 'NAME' => 'Unicode::Collate', - 'VERSION_FROM' => 'Collate.pm', # finds $VERSION - 'clean' => $clean, - 'PREREQ_PM' => { - Carp => 0, - constant => 0, - DynaLoader => 0, - File::Spec => 0, - strict => 0, - Test => 0, - warnings => 0, - }, -); diff --git a/cpan/Unicode-Collate/README b/cpan/Unicode-Collate/README index 7142c5f..16bf8c4 100644 --- a/cpan/Unicode-Collate/README +++ b/cpan/Unicode-Collate/README @@ -1,4 +1,4 @@ -Unicode/Collate version 0.70 +Unicode/Collate version 0.68 =============================== NAME @@ -40,7 +40,6 @@ INSTALL gendata/*, and mklocale. Tests for Unicode::Collate::Locale are named t/loc_*.t. -Since 0.54, XSUB that requires a C compiler will be built by default. To install this module type the following: perl Makefile.PL @@ -48,20 +47,20 @@ To install this module type the following: make test make install -Even if a C compiler is not available, pure Perl (i.e. non-XS) edition -is available; type the following: +If you have a C compiler and want to use XSUB edition, +type the following (!! "enableXS" must run before "Makefile.PL" !!): - perl disableXS + perl enableXS perl Makefile.PL make make test make install -If you decide to install XSUB edition after trying to build pure Perl, -type the following: +If you decide to install pure Perl (i.e. non-XS) edition after trying +to build XSUB, type the following: make clean - perl enableXS + perl disableXS perl Makefile.PL make make test @@ -108,7 +107,7 @@ HOW TO CHANGE DUCET (NOT WARRANTED) AUTHOR, COPYRIGHT AND LICENSE The Unicode::Collate module for perl was written by SADAHIRO Tomoyuki, -. This module is Copyright(C) 2001-2011, +. This module is Copyright(C) 2001-2010, SADAHIRO Tomoyuki. Japan. All rights reserved. This module is free software; you can redistribute it and/or diff --git a/cpan/Unicode-Collate/mkheader b/cpan/Unicode-Collate/mkheader deleted file mode 100644 index dde4ee1..0000000 --- a/cpan/Unicode-Collate/mkheader +++ /dev/null @@ -1,196 +0,0 @@ -#!perl -# -# This auxiliary script makes five header files -# used for building XSUB of Unicode::Collate. -# -# Usage: -# in perl, or in command line -# -# Input file: -# Collate/allkeys.txt -# -# Output file: -# ucatbl.h -# -use 5.006; -use strict; -use warnings; -use Carp; -use File::Spec; - -BEGIN { - unless ("A" eq pack('U', 0x41)) { - die "Unicode::Collate cannot stringify a Unicode code point\n"; - } -} - -use constant TRUE => 1; -use constant FALSE => ""; -use constant VCE_TEMPLATE => 'Cn4'; - -sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g } - -our $PACKAGE = 'Unicode::Collate, mkheader'; -our $prefix = "UCA_"; - -our %SimpleEntries; # $codepoint => $keys -our @Rest; - -{ - my($f, $fh); - foreach my $d ('.') { - $f = File::Spec->catfile($d, "Collate", "allkeys.txt"); - last if open($fh, $f); - $f = undef; - } - croak "$PACKAGE: Collate/allkeys.txt is not found" if !defined $f; - - while (my $line = <$fh>) { - next if $line =~ /^\s*#/; - if ($line =~ /^\s*\@/) { - push @Rest, $line; - next; - } - - next if $line !~ /^\s*[0-9A-Fa-f]/; - - $line =~ s/[#%]\s*(.*)//; # removing comment (not getting the name) - - # gets element - my($e, $k) = split /;/, $line; - - croak "Wrong Entry: must be separated by ';' ". - "from " if ! $k; - - my @uv = _getHexArray($e); - next if !@uv; - - if (@uv != 1) { - push @Rest, $line; - next; - } - - my $is_L3_ignorable = TRUE; - - my @key; - foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed - my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient. - my @wt = _getHexArray($arr); - push @key, pack(VCE_TEMPLATE, $var, @wt); - $is_L3_ignorable = FALSE - if $wt[0] || $wt[1] || $wt[2]; - # Conformance Test for 3.1.1 and 4.0.0 shows Level 3 ignorable - # is completely ignorable. - # For expansion, an entry $is_L3_ignorable - # if and only if "all" CEs are [.0000.0000.0000]. - } - my $mapping = $is_L3_ignorable ? [] : \@key; - my $num = @$mapping; - my $str = chr($num).join('', @$mapping); - $SimpleEntries{$uv[0]} = stringify($str); - } -} - -sub stringify { - my $str = shift; - return sprintf '"%s"', join '', - map sprintf("\\x%02x", ord $_), split //, $str; - -} - -########## writing header files ########## - -my $init = ''; -{ - my $type = "char*"; - my $head = $prefix."rest"; - - $init .= "static $type $head [] = {\n"; - for my $line (@Rest) { - $line =~ s/\s*\z//; - next if $line eq ''; - $init .= "/*$line*/\n" if $line =~ /^[A-Za-z0-9_.:;@\ \[\]]+\z/; - $init .= "($type)".stringify($line).",\n"; - } - $init .= "NULL\n"; # sentinel - $init .= "};\n\n"; -} - -my @tripletable = ( - { - file => "ucatbl", - name => "simple", - type => "char*", - hash => \%SimpleEntries, - null => "NULL", - init => $init, - }, -); - -foreach my $tbl (@tripletable) { - my $file = "$tbl->{file}.h"; - my $head = "${prefix}$tbl->{name}"; - my $type = $tbl->{type}; - my $hash = $tbl->{hash}; - my $null = $tbl->{null}; - my $init = $tbl->{init}; - - open FH, ">$file" or croak "$PACKAGE: $file can't be made"; - binmode FH; select FH; - my %val; - - print FH << 'EOF'; -/* - * This file is auto-generated by mkheader. - * Any changes here will be lost! - */ -EOF - - print $init if defined $init; - - foreach my $uv (keys %$hash) { - croak sprintf("a Unicode code point 0x%04X over 0x10FFFF.", $uv) - unless $uv <= 0x10FFFF; - my @c = unpack 'CCCC', pack 'N', $uv; - $val{ $c[1] }{ $c[2] }{ $c[3] } = $hash->{$uv}; - } - - foreach my $p (sort { $a <=> $b } keys %val) { - next if ! $val{ $p }; - for (my $r = 0; $r < 256; $r++) { - next if ! $val{ $p }{ $r }; - printf "static $type ${head}_%02x_%02x [256] = {\n", $p, $r; - for (my $c = 0; $c < 256; $c++) { - print "\t", defined $val{$p}{$r}{$c} - ? "($type)".$val{$p}{$r}{$c} - : $null; - print ',' if $c != 255; - print "\n" if $c % 8 == 7; - } - print "};\n\n"; - } - } - foreach my $p (sort { $a <=> $b } keys %val) { - next if ! $val{ $p }; - printf "static $type* ${head}_%02x [256] = {\n", $p; - for (my $r = 0; $r < 256; $r++) { - print $val{ $p }{ $r } - ? sprintf("${head}_%02x_%02x", $p, $r) - : "NULL"; - print ',' if $r != 255; - print "\n" if $val{ $p }{ $r } || ($r+1) % 8 == 0; - } - print "};\n\n"; - } - print "static $type** $head [] = {\n"; - for (my $p = 0; $p <= 0x10; $p++) { - print $val{ $p } ? sprintf("${head}_%02x", $p) : "NULL"; - print ',' if $p != 0x10; - print "\n"; - } - print "};\n\n"; - close FH; -} - -1; -__END__ diff --git a/cpan/Unicode-Collate/t/loc_test.t b/cpan/Unicode-Collate/t/loc_test.t index 60c9773..d1b5b4a 100644 --- a/cpan/Unicode-Collate/t/loc_test.t +++ b/cpan/Unicode-Collate/t/loc_test.t @@ -12,7 +12,7 @@ BEGIN { } use Test; -BEGIN { plan tests => 120 }; +BEGIN { plan tests => 116 }; use strict; use warnings; @@ -127,13 +127,3 @@ our @sortFr = $objFr->sort(@randFr); ok("@sortFr" eq "@listFr"); # 116 - -{ - my $keyXS = '__useXS'; # see Unicode::Collate internal - my $UseXS = ref Unicode::Collate->new->{$keyXS}; - ok(ref($Collator->{$keyXS}), $UseXS); - ok(ref($objFr ->{$keyXS}), $UseXS); - ok(ref($objEs ->{$keyXS}), $UseXS); - ok(ref($objEsT ->{$keyXS}), $UseXS); -} -# 120 -- 2.7.4