return;
}
+sub han_charname {
+ my $arg = shift;
+ my $code = _getcode($arg);
+ croak __PACKAGE__, "::charinfo: unknown code '$arg'"
+ unless defined $code;
+ croak __PACKAGE__, "::han_charname: outside CJK Unified Ideographs '$arg'"
+ unless 0x3400 <= $code && $code <= 0x4DB5
+ || 0x4E00 <= $code && $code <= 0x9FA5
+ || 0x20000 <= $code && $code <= 0x2A6D6;
+ sprintf "CJK UNIFIED IDEOGRAPH-%04X", $code;
+}
+
+my @JamoL = ( # Leading Consonant (HANGUL CHOSEONG)
+ "G", "GG", "N", "D", "DD", "R", "M", "B", "BB",
+ "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H",
+ );
+
+my @JamoV = ( # Medium Vowel (HANGUL JUNGSEONG)
+ "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O",
+ "WA", "WAE", "OE", "YO", "U", "WEO", "WE", "WI",
+ "YU", "EU", "YI", "I",
+ );
+
+my @JamoT = ( # Trailing Consonant (HANGUL JONGSEONG)
+ "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM",
+ "LB", "LS", "LT", "LP", "LH", "M", "B", "BS",
+ "S", "SS", "NG", "J", "C", "K", "T", "P", "H",
+ );
+
+my %HangulConst = (
+ SBase => 0xAC00,
+ LBase => 0x1100,
+ VBase => 0x1161,
+ TBase => 0x11A7,
+ LCount => 19, # scalar @JamoL
+ VCount => 21, # scalar @JamoV
+ TCount => 28, # scalar @JamoT
+ NCount => 588, # VCount * TCount
+ SCount => 11172, # LCount * NCount
+ Final => 0xD7A3, # SBase -1 + SCount
+ );
+
+sub hangul_charname {
+ my $arg = shift;
+ my $code = _getcode($arg);
+ croak __PACKAGE__, "::charinfo: unknown code '$arg'"
+ unless defined $code;
+ croak __PACKAGE__, "::hangul_charname: outside Hangul Syllables '$arg'"
+ unless $HangulConst{SBase} <= $code && $code <= $HangulConst{Final};
+ my $SIndex = $code - $HangulConst{SBase};
+ my $LIndex = int( $SIndex / $HangulConst{NCount});
+ my $VIndex = int(($SIndex % $HangulConst{NCount}) / $HangulConst{TCount});
+ my $TIndex = $SIndex % $HangulConst{TCount};
+ return join('',
+ "HANGUL SYLLABLE ",
+ $JamoL[$LIndex],
+ $JamoV[$VIndex],
+ $JamoT[$TIndex],
+ );
+}
+
+sub hangul_decomp {
+ my $arg = shift;
+ my $code = _getcode($arg);
+ croak __PACKAGE__, "::charinfo: unknown code '$arg'"
+ unless defined $code;
+ croak __PACKAGE__, "::hangul_decomp: outside Hangul Syllables '$arg'"
+ unless $HangulConst{SBase} <= $code && $code <= $HangulConst{Final};
+ my $SIndex = $code - $HangulConst{SBase};
+ my $LIndex = int( $SIndex / $HangulConst{NCount});
+ my $VIndex = int(($SIndex % $HangulConst{NCount}) / $HangulConst{TCount});
+ my $TIndex = $SIndex % $HangulConst{TCount};
+
+ return join(" ",
+ sprintf("%04X", $HangulConst{LBase} + $LIndex),
+ sprintf("%04X", $HangulConst{VBase} + $VIndex),
+ $TIndex ?
+ sprintf("%04X", $HangulConst{TBase} + $TIndex) : (),
+ );
+}
+
+my @CharinfoRanges = (
+# block name
+# [ first, last, coderef to name, coderef to decompose ],
+# CJK Ideographs Extension A
+ [ 0x3400, 0x4DB5, \&han_charname, undef ],
+# CJK Ideographs
+ [ 0x4E00, 0x9FA5, \&han_charname, undef ],
+# Hangul Syllables
+ [ 0xAC00, 0xD7A3, \&hangul_charname, \&hangul_decomp ],
+# Non-Private Use High Surrogates
+ [ 0xD800, 0xDB7F, undef, undef ],
+# Private Use High Surrogates
+ [ 0xDB80, 0xDBFF, undef, undef ],
+# Low Surrogates
+ [ 0xDC00, 0xDFFF, undef, undef ],
+# The Private Use Area
+ [ 0xE000, 0xF8FF, undef, undef ],
+# CJK Ideographs Extension B
+ [ 0x20000, 0x2A6D6, \&han_charname, undef ],
+# Plane 15 Private Use Area
+ [ 0xF0000, 0xFFFFD, undef, undef ],
+# Plane 16 Private Use Area
+ [ 0x100000, 0x10FFFD, undef, undef ],
+);
+
sub charinfo {
my $arg = shift;
my $code = _getcode($arg);
croak __PACKAGE__, "::charinfo: unknown code '$arg'"
unless defined $code;
my $hexk = sprintf("%04X", $code);
-
- openunicode(\$UNICODEFH, "Unicode.txt");
+ my($rcode,$rname,$rdec);
+ foreach my $range (@CharinfoRanges){
+ if($range->[0] <= $code && $code <= $range->[1]){
+ $rcode = $hexk;
+ $rname = $range->[2] ? $range->[2]->($code) : '';
+ $rdec = $range->[3] ? $range->[3]->($code) : '';
+ $hexk = sprintf("%04X",$range->[0]); # replace by the first
+ last;
+ }
+ }
+ openunicode(\$UNICODEFH, "Unicode.sort"); # sorted
if (defined $UNICODEFH) {
use Search::Dict;
if (look($UNICODEFH, "$hexk;") >= 0) {
if ($prop{code} eq $hexk) {
$prop{block} = charblock($code);
$prop{script} = charscript($code);
+ if(defined $rname){
+ $prop{code} = $rcode;
+ $prop{name} = $rname;
+ $prop{decomposition} = $rdec;
+ }
return \%prop;
}
}
use Test;
use strict;
-BEGIN { plan tests => 111 };
+BEGIN { plan tests => 111 + 17 * 3};
use UnicodeCD 'charinfo';
ok($charinfo->{block}, 'Hebrew');
ok($charinfo->{script}, 'Hebrew');
+# an open syllable in Hangul
+
+$charinfo = charinfo(0xAC00);
+
+ok($charinfo->{code}, 'AC00');
+ok($charinfo->{name}, 'HANGUL SYLLABLE GA');
+ok($charinfo->{category}, 'Lo');
+ok($charinfo->{combining}, '0');
+ok($charinfo->{bidi}, 'L');
+ok($charinfo->{decomposition}, '1100 1161');
+ok($charinfo->{decimal}, '');
+ok($charinfo->{digit}, '');
+ok($charinfo->{numeric}, '');
+ok($charinfo->{mirrored}, 'N');
+ok($charinfo->{unicode10}, '');
+ok($charinfo->{comment}, '');
+ok($charinfo->{upper}, '');
+ok($charinfo->{lower}, '');
+ok($charinfo->{title}, '');
+ok($charinfo->{block}, 'Hangul Syllables');
+ok($charinfo->{script}, 'Hangul');
+
+# a close syllable in Hangul
+
+$charinfo = charinfo(0xAE00);
+
+ok($charinfo->{code}, 'AE00');
+ok($charinfo->{name}, 'HANGUL SYLLABLE GEUL');
+ok($charinfo->{category}, 'Lo');
+ok($charinfo->{combining}, '0');
+ok($charinfo->{bidi}, 'L');
+ok($charinfo->{decomposition}, '1100 1173 11AF');
+ok($charinfo->{decimal}, '');
+ok($charinfo->{digit}, '');
+ok($charinfo->{numeric}, '');
+ok($charinfo->{mirrored}, 'N');
+ok($charinfo->{unicode10}, '');
+ok($charinfo->{comment}, '');
+ok($charinfo->{upper}, '');
+ok($charinfo->{lower}, '');
+ok($charinfo->{title}, '');
+ok($charinfo->{block}, 'Hangul Syllables');
+ok($charinfo->{script}, 'Hangul');
+
+$charinfo = charinfo(0x1D400);
+
+ok($charinfo->{code}, '1D400');
+ok($charinfo->{name}, 'MATHEMATICAL BOLD CAPITAL A');
+ok($charinfo->{category}, 'Lu');
+ok($charinfo->{combining}, '0');
+ok($charinfo->{bidi}, 'L');
+ok($charinfo->{decomposition}, '<font> 0041');
+ok($charinfo->{decimal}, '');
+ok($charinfo->{digit}, '');
+ok($charinfo->{numeric}, '');
+ok($charinfo->{mirrored}, 'N');
+ok($charinfo->{unicode10}, '');
+ok($charinfo->{comment}, '');
+ok($charinfo->{upper}, '');
+ok($charinfo->{lower}, '');
+ok($charinfo->{title}, '');
+ok($charinfo->{block}, 'Mathematical Alphanumeric Symbols');
+ok($charinfo->{script}, undef);
+
use UnicodeCD qw(charblock charscript);
# 0x0590 is in the Hebrew block but unused.