# Input files:
# unicore/CombiningClass.pl (or unicode/CombiningClass.pl)
# unicore/Decomposition.pl (or unicode/Decomposition.pl)
-# unicore/CompositionExclusions.txt (or unicode/CompExcl.txt)
#
# Output files:
# unfcan.h
our %NonStD; # $codepoint => 1 : non-starter decompositions
our %Comp2nd; # $codepoint => 1 : may be composed with a prev char.
-# from Unicode database
+# from core Unicode database
our $Combin = do "unicore/CombiningClass.pl"
|| do "unicode/CombiningClass.pl"
|| croak "$PACKAGE: CombiningClass.pl not found";
|| do "unicode/Decomposition.pl"
|| croak "$PACKAGE: Decomposition.pl not found";
+# Hex code points taken from CompositionExclusions.txt (the list as it stands since Unicode 3.2.0)
+our @CompEx = qw(
+ 0958 0959 095A 095B 095C 095D 095E 095F 09DC 09DD 09DF 0A33 0A36
+ 0A59 0A5A 0A5B 0A5E 0B5C 0B5D 0F43 0F4D 0F52 0F57 0F5C 0F69 0F76
+ 0F78 0F93 0F9D 0FA2 0FA7 0FAC 0FB9 FB1D FB1F FB2A FB2B FB2C FB2D
+ FB2E FB2F FB30 FB31 FB32 FB33 FB34 FB35 FB36 FB38 FB39 FB3A FB3B
+ FB3C FB3E FB40 FB41 FB43 FB44 FB46 FB47 FB48 FB49 FB4A FB4B FB4C
+ FB4D FB4E 2ADC 1D15E 1D15F 1D160 1D161 1D162 1D163 1D164 1D1BB
+ 1D1BC 1D1BD 1D1BE 1D1BF 1D1C0
+); # entries are hex strings without a "0x" prefix; convert with hex() before use
+
# definition of Hangul constants
use constant SBase => 0xAC00;
use constant SFinal => 0xD7A3; # SBase -1 + SCount
}
########## getting full decomposition ##########
-{
- my($f, $fh);
- foreach my $d (@INC) {
- $f = File::Spec->catfile($d, "unicore", "CompositionExclusions.txt");
- last if open($fh, $f);
- $f = File::Spec->catfile($d, "unicore", "CompExcl.txt");
- last if open($fh, $f);
- $f = File::Spec->catfile($d, "unicode", "CompExcl.txt");
- last if open($fh, $f);
- $f = undef;
- }
- croak "$PACKAGE: neither unicore/CompositionExclusions.txt "
- . "nor unicode/CompExcl.txt is found in @INC" unless defined $f;
-
- while (<$fh>) {
- next if /^#/ or /^$/;
- s/#.*//;
- $Exclus{ hex($1) } = 1 if /([0-9A-Fa-f]+)/;
- }
- close $fh;
-}
## converts string "hhhh hhhh hhhh" to a numeric list
## (hex digits separated by spaces)
foreach my $u ($ini .. $end) {
$Compat{$u} = $dec;
+ $Canon{$u} = $dec if ! $compat;
+ }
+}
- if (! $compat) {
- $Canon{$u} = $dec;
-
- if (@$dec == 2) {
- if ($Combin{ $dec->[0] }) {
- $NonStD{$u} = 1;
- } else {
- $Compos{ $dec->[0] }{ $dec->[1] } = $u;
- $Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$u};
- }
- } elsif (@$dec == 1) {
- $Single{$u} = 1;
- } else {
- croak("Weird Canonical Decomposition of U+$tab[0]");
- }
+for my $s (@CompEx) { # populate %Exclus from the hard-coded exclusion list
+ my $u = hex $s; # list entries are hex strings; work with the numeric code point
+ next if !$Canon{$u}; # no canonical decomposition in the loaded data: not assigned
+ next if $u == 0xFB1D && !$Canon{0x1D15E}; # U+FB1D is skipped while U+1D15E has no decomposition: 3.0.1 before Corrigendum #2
+ $Exclus{$u} = 1; # flag $u as a composition exclusion
+}
+
+foreach my $u (keys %Canon) { # classify every code point that has a canonical decomposition
+ my $dec = $Canon{$u}; # array ref holding the decomposition of $u
+
+ if (@$dec == 2) { # two-element decomposition
+ if ($Combin{ $dec->[0] }) { # first element has a true %Combin entry
+ $NonStD{$u} = 1; # record as a non-starter decomposition
+ } else {
+ $Compos{ $dec->[0] }{ $dec->[1] } = $u; # (first, second) => composite lookup
+ $Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$u}; # second element may compose with a previous char, unless $u is excluded
}
+ } elsif (@$dec == 1) { # one-element decomposition
+ $Single{$u} = 1; # record $u as decomposing to a single code point
+ } else {
+ my $h = sprintf '%04X', $u; # format the code point as at least four uppercase hex digits for the message
+ croak("Weird Canonical Decomposition of U+$h"); # any length other than 1 or 2 is treated as fatal
}
}
use Test;
use strict;
use warnings;
-BEGIN { plan tests => 211 };
+BEGIN { plan tests => 217 };
use Unicode::Normalize qw(:all);
ok(1); # If we made it this far, we're ok.
ok(getCanon(0x3243), undef);
ok(getCanon(0xFA2D), _pack_U(0x9DB4));
+# 20
+
ok(getCompat( 0), undef);
ok(getCompat(0x29), undef);
ok(getCompat(0x41), undef);
ok(getComposite(0xAC00, 0x11A8), 0xAC01);
ok(getComposite(0xADF8, 0x11AF), 0xAE00);
+# 53
+
sub uprops {
my $uv = shift;
my $r = "";
ok(uprops(0xFB4E), 'XsnFbDmCKyG'); # HEBREW LETTER PE WITH RAFE
ok(uprops(0xFF71), 'xsnfbdmcKyG'); # HALFWIDTH KATAKANA LETTER A
+# 71
+
ok(decompose(""), "");
ok(decompose("A"), "A");
ok(decompose("", 1), "");
ok(decompose(hexU("AC00 AE00"), 1), hexU("1100 1161 1100 1173 11AF"));
ok(decompose(hexU("304C FF76"), 1), hexU("304B 3099 30AB"));
+# 81
+
# don't modify the source
my $sDec = "\x{FA19}";
ok(decompose($sDec), "\x{795E}");
ok(compose($sCom), "\x{304C}");
ok($sCom, "\x{304B}\x{3099}");
+# 100
+
ok(composeContiguous(""), "");
ok(composeContiguous("A"), "A");
ok(composeContiguous(hexU("0061 0300")), hexU("00E0"));
ok(composeContiguous($sCtg), "\x{30DD}");
ok($sCtg, "\x{30DB}\x{309A}");
+# 111
+
sub answer { defined $_[0] ? $_[0] ? "YES" : "NO" : "MAYBE" }
ok(answer(checkNFD("")), "YES");
ok(answer(checkNFKC(hexU("0041 030A 0327"))), "NO"); # A+ring+cedilla
ok(answer(check("NFKC", hexU("20 C1 212B 300"))), "NO");
+# 145
+
"012ABC" =~ /(\d+)(\w+)/;
ok("012" eq NFC $1 && "ABC" eq NFC $2);
ok(isNFD_NO ("0192"));
ok(isNFKD_NO("0192"));
+# 156
+
# DEVANAGARI LETTER QA
ok(isExclusion("02392"));
ok(isComp_Ex ("02392"));
ok(getCompat("044032"), _pack_U(0x1100, 0x1161));
ok(getComposite("04352", "04449"), 0xAC00);
+# 182
+
# string with 22 combining characters: (0x300..0x315)
my $str_cc22 = _pack_U(0x3041, 0x300..0x315, 0x3042);
ok(decompose($str_cc22), $str_cc22);
ok(FCD($str_cc40), $str_cc40);
ok(FCC($str_cc40), $str_cc40);
+# 202
+
my $precomp = hexU("304C 304E 3050 3052 3054");
my $combseq = hexU("304B 3099 304D 3099 304F 3099 3051 3099 3053 3099");
ok(decompose($precomp x 5), $combseq x 5);
ok(decompose($precomp . $notcomp x 5), $combseq . $notcomp x 5);
ok(decompose($precomp . $notcomp x10), $combseq . $notcomp x10);
+# 211
+
+my $preUnicode3_1 = !defined getCanon(0x1D15E); # true when U+1D15E has no canonical decomposition in the loaded data
+my $preUnicode3_2 = !defined getCanon(0x2ADC); # true when U+2ADC has no canonical decomposition in the loaded data
+
+# U+FB1D HEBREW LETTER YOD WITH HIRIQ: must be an exclusion exactly when the data is not pre-3.1
+ok($preUnicode3_1 xor isExclusion(0xFB1D));
+ok($preUnicode3_1 xor isComp_Ex (0xFB1D));
+
+# U+1D15E MUSICAL SYMBOL HALF NOTE: must be an exclusion exactly when the data is not pre-3.1
+ok($preUnicode3_1 xor isExclusion(0x1D15E));
+ok($preUnicode3_1 xor isComp_Ex (0x1D15E));
+
+# U+2ADC FORKING: must be an exclusion exactly when the data is not pre-3.2
+ok($preUnicode3_2 xor isExclusion(0x2ADC));
+ok($preUnicode3_2 xor isComp_Ex (0x2ADC));
+# 217