our @EXPORT_OK = qw(charinfo
charblock charscript
charblocks charscripts
- charinrange);
+ charinrange
+ compexcl
+ casefold casespec);
use Carp;
=head1 SYNOPSIS
use Unicode::UCD 'charinfo';
- my %charinfo = charinfo($codepoint);
+ my $charinfo = charinfo($codepoint);
use Unicode::UCD 'charblock';
my $charblock = charblock($codepoint);
my $BLOCKSFH;
my $SCRIPTSFH;
my $VERSIONFH;
+my $COMPEXCLFH;
+my $CASEFOLDFH;
+my $CASESPECFH;
sub openunicode {
my ($rfh, @path) = @_;
use Unicode::UCD 'charinfo';
- my %charinfo = charinfo(0x41);
+ my $charinfo = charinfo(0x41);
-charinfo() returns a hash that has the following fields as defined
-by the Unicode standard:
+charinfo() returns a reference to a hash that has the following fields
+as defined by the Unicode standard:
key
block block the character belongs to (used in \p{In...})
script script the character belongs to
-If no match is found, an empty hash is returned.
+If no match is found, a reference to an empty hash is returned.
The C<block> property is the same as as returned by charinfo(). It is
not defined in the Unicode Character Database proper (Chapter 4 of the
Note that you cannot do (de)composition and casing based solely on the
above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
-you will need also the I<Composition Exclusions>, I<Case Folding>, and
-I<SpecialCasing> tables, available as files F<CompExcl.txt>,
-F<CaseFold.txt>, and F<SpecCase.txt> in the Perl distribution.
+you will need also the compexcl(), casefold(), and casespec() functions.
=cut
if ($prop{code} eq $hexk) {
$prop{block} = charblock($code);
$prop{script} = charscript($code);
- return %prop;
+ return \%prop;
}
}
}
With a B<code point argument> charblock() returns the block the character
belongs to, e.g. C<Basic Latin>. Note that not all the character
-positions within all blocks are defined. A <code point argument>
-is either a decimal or a hexadecimal scalar, or "U+" followed
-by hexadecimals.
+positions within all blocks are defined.
If supplied with an argument that can't be a code point, charblock()
tries to do the opposite and interpret the argument as a character
my $ranges = charscript('Thai');
With a B<code point argument> charscript() returns the script the
-character belongs to, e.g. C<Latin>, C<Greek>, C<Han>. A <code point
-argument> is either a decimal or a hexadecimal scalar, or "U+"
-followed by hexadecimals.
+character belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
If supplied with an argument that can't be a code point, charscript()
tries to do the opposite and interpret the argument as a character
use Unicode::UCD 'charblocks';
- my %charblocks = charblocks();
+ my $charblocks = charblocks();
-charblocks() returns a hash with the known block names as the keys,
-and the code point ranges (see L</charblock>) as the values.
+charblocks() returns a reference to a hash with the known block names
+as the keys, and the code point ranges (see L</charblock>) as the values.
=cut
sub charblocks {
- _charblocks() unless @BLOCKS;
- return %BLOCKS;
+ _charblocks() unless %BLOCKS;
+ return \%BLOCKS;
}
=head2 charscripts
=cut
sub charscripts {
- _charscripts() unless @SCRIPTS;
- return %SCRIPTS;
+ _charscripts() unless %SCRIPTS;
+ return \%SCRIPTS;
}
=head2 Blocks versus Scripts
the same name, in these cases the block version has C<Block> appended:
C<\p{InKatakana}> is the script, C<\p{InKatakanaBlock}> is the block.
+=head2 Code Point Arguments
+
+A <code point argument> is either a decimal or a hexadecimal scalar,
+or "U+" followed by hexadecimals.
+
=head2 charinrange
In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
=cut
+=head2 compexcl
+
+ use Unicode::UCD 'compexcl';
+
+ my $compexcl = compexcl("09dc");
+
+The compexcl() returns the composition exclusion (that is, if the
+character cannot be decomposed) of the character specified by a B<code
+point argument>.
+
+If there is a composition exclusion for the character, true is
+returned. Otherwise, false is returned.
+
+=cut
+
+my %COMPEXCL;
+
+sub _compexcl {
+ unless (%COMPEXCL) {
+ if (openunicode(\$COMPEXCLFH, "CompExcl.txt")) {
+ while (<$COMPEXCLFH>) {
+ if (/^([0-9A-F]+) \# /) {
+ my $code = hex($1);
+ $COMPEXCL{$code} = undef;
+ }
+ }
+ close($COMPEXCLFH);
+ }
+ }
+}
+
+sub compexcl {
+ my $arg = shift;
+ my $code = _getcode($arg);
+
+ _compexcl() unless %COMPEXCL;
+
+ return exists $COMPEXCL{$code};
+}
+
+=head2 casefold
+
+ use Unicode::UCD 'casefold';
+
+ my %casefold = casefold("09dc");
+
+The casefold() returns the locale-independent case folding of the
+character specified by a B<code point argument>.
+
+If there is a case folding for that character, a reference to a hash
+with the following fields is returned:
+
+ key
+
+ code code point with at least four hexdigits
+ status "C", "F", "S", or "I"
+ mapping one or more codes separated by spaces
+
+The meaning of the I<status> is as follows:
+
+ C common case folding, common mappings shared
+ by both simple and full mappings
+ F full case folding, mappings that cause strings
+ to grow in length. Multiple characters are separated
+ by spaces
+ S simple case folding, mappings to single characters
+ where different from F
+ I special case for dotted uppercase I and
+ dotless lowercase i
+ - If this mapping is included, the result is
+ case-insensitive, but dotless and dotted I's
+ are not distinguished
+ - If this mapping is excluded, the result is not
+ fully case-insensitive, but dotless and dotted
+ I's are distinguished
+
+If there is no case folding for that character, C<undef> is returned.
+
+For more information about case mappings see
+http://www.unicode.org/unicode/reports/tr21/
+
+=cut
+
+my %CASEFOLD;
+
+sub _casefold {
+ unless (%CASEFOLD) {
+ if (openunicode(\$CASEFOLDFH, "CaseFold.txt")) {
+ while (<$CASEFOLDFH>) {
+ if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
+ my $code = hex($1);
+ $CASEFOLD{$code} = { code => $1,
+ status => $2,
+ mapping => $3 };
+ }
+ }
+ close($CASEFOLDFH);
+ }
+ }
+}
+
+sub casefold {
+ my $arg = shift;
+ my $code = _getcode($arg);
+
+ _casefold() unless %CASEFOLD;
+
+ return $CASEFOLD{$code};
+}
+
+=head2 casespec
+
+ use Unicode::UCD 'casespec';
+
+ my %casespec = casespec("09dc");
+
+The casespec() returns the potentially locale-dependent case mapping
+of the character specified by a B<code point argument>. The mapping
+may change the length of the string (which the basic Unicode case
+mappings as returned by charinfo() never do).
+
+If there is a case folding for that character, a reference to a hash
+with the following fields is returned:
+
+ key
+
+ code code point with at least four hexdigits
+ lower lowercase
+ title titlecase
+ upper uppercase
+ condition condition list (may be undef)
+
+The C<condition> is optional. Where present, it consists of one or
+more I<locales> or I<contexts>, separated by spaces (other than as
+used to separate elements, spaces are to be ignored). A condition
+list overrides the normal behavior if all of the listed conditions are
+true. Case distinctions in the condition list are not significant.
+Conditions preceded by "NON_" represent the negation of the condition
+
+A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
+followed by a "_" and a 2-letter ISO language code (, possibly followed
+by a "_" and a variant code). You can find the list of those codes
+in L<Locale::Country> and L<Locale::Language>.
+
+A I<context> is one of the following choices:
+
+ FINAL The letter is not followed by a letter of
+ general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
+ MODERN The mapping is only used for modern text
+ AFTER_i The last base character was "i" 0069
+
+For more information about case mappings see
+http://www.unicode.org/unicode/reports/tr21/
+
+=cut
+
+my %CASESPEC;
+
+sub _casespec {
+ unless (%CASESPEC) {
+ if (openunicode(\$CASESPECFH, "SpecCase.txt")) {
+ while (<$CASESPECFH>) {
+ if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
+ my $code = hex($1);
+ $CASESPEC{$code} = { code => $1,
+ lower => $2,
+ title => $3,
+ upper => $4,
+ condition => $5 };
+ }
+ }
+ close($CASESPECFH);
+ }
+ }
+}
+
+sub casespec {
+ my $arg = shift;
+ my $code = _getcode($arg);
+
+ _casespec() unless %CASESPEC;
+
+ return $CASESPEC{$code};
+}
+
=head2 Unicode::UCD::UnicodeVersion
Unicode::UCD::UnicodeVersion() returns the version of the Unicode Character
use Test;
use strict;
-BEGIN { plan tests => 103 };
+BEGIN { plan tests => 111 };
use Unicode::UCD 'charinfo';
-my %charinfo;
-
-%charinfo = charinfo(0x41);
-
-ok($charinfo{code}, '0041');
-ok($charinfo{name}, 'LATIN CAPITAL LETTER A');
-ok($charinfo{category}, 'Lu');
-ok($charinfo{combining}, '0');
-ok($charinfo{bidi}, 'L');
-ok($charinfo{decomposition}, '');
-ok($charinfo{decimal}, '');
-ok($charinfo{digit}, '');
-ok($charinfo{numeric}, '');
-ok($charinfo{mirrored}, 'N');
-ok($charinfo{unicode10}, '');
-ok($charinfo{comment}, '');
-ok($charinfo{upper}, '');
-ok($charinfo{lower}, '0061');
-ok($charinfo{title}, '');
-ok($charinfo{block}, 'Basic Latin');
-ok($charinfo{script}, 'Latin');
-
-%charinfo = charinfo(0x100);
-
-ok($charinfo{code}, '0100');
-ok($charinfo{name}, 'LATIN CAPITAL LETTER A WITH MACRON');
-ok($charinfo{category}, 'Lu');
-ok($charinfo{combining}, '0');
-ok($charinfo{bidi}, 'L');
-ok($charinfo{decomposition}, '0041 0304');
-ok($charinfo{decimal}, '');
-ok($charinfo{digit}, '');
-ok($charinfo{numeric}, '');
-ok($charinfo{mirrored}, 'N');
-ok($charinfo{unicode10}, 'LATIN CAPITAL LETTER A MACRON');
-ok($charinfo{comment}, '');
-ok($charinfo{upper}, '');
-ok($charinfo{lower}, '0101');
-ok($charinfo{title}, '');
-ok($charinfo{block}, 'Latin Extended-A');
-ok($charinfo{script}, 'Latin');
+my $charinfo;
+
+$charinfo = charinfo(0x41);
+
+ok($charinfo->{code}, '0041');
+ok($charinfo->{name}, 'LATIN CAPITAL LETTER A');
+ok($charinfo->{category}, 'Lu');
+ok($charinfo->{combining}, '0');
+ok($charinfo->{bidi}, 'L');
+ok($charinfo->{decomposition}, '');
+ok($charinfo->{decimal}, '');
+ok($charinfo->{digit}, '');
+ok($charinfo->{numeric}, '');
+ok($charinfo->{mirrored}, 'N');
+ok($charinfo->{unicode10}, '');
+ok($charinfo->{comment}, '');
+ok($charinfo->{upper}, '');
+ok($charinfo->{lower}, '0061');
+ok($charinfo->{title}, '');
+ok($charinfo->{block}, 'Basic Latin');
+ok($charinfo->{script}, 'Latin');
+
+$charinfo = charinfo(0x100);
+
+ok($charinfo->{code}, '0100');
+ok($charinfo->{name}, 'LATIN CAPITAL LETTER A WITH MACRON');
+ok($charinfo->{category}, 'Lu');
+ok($charinfo->{combining}, '0');
+ok($charinfo->{bidi}, 'L');
+ok($charinfo->{decomposition}, '0041 0304');
+ok($charinfo->{decimal}, '');
+ok($charinfo->{digit}, '');
+ok($charinfo->{numeric}, '');
+ok($charinfo->{mirrored}, 'N');
+ok($charinfo->{unicode10}, 'LATIN CAPITAL LETTER A MACRON');
+ok($charinfo->{comment}, '');
+ok($charinfo->{upper}, '');
+ok($charinfo->{lower}, '0101');
+ok($charinfo->{title}, '');
+ok($charinfo->{block}, 'Latin Extended-A');
+ok($charinfo->{script}, 'Latin');
# 0x0590 is in the Hebrew block but unused.
-%charinfo = charinfo(0x590);
-
-ok($charinfo{code}, undef);
-ok($charinfo{name}, undef);
-ok($charinfo{category}, undef);
-ok($charinfo{combining}, undef);
-ok($charinfo{bidi}, undef);
-ok($charinfo{decomposition}, undef);
-ok($charinfo{decimal}, undef);
-ok($charinfo{digit}, undef);
-ok($charinfo{numeric}, undef);
-ok($charinfo{mirrored}, undef);
-ok($charinfo{unicode10}, undef);
-ok($charinfo{comment}, undef);
-ok($charinfo{upper}, undef);
-ok($charinfo{lower}, undef);
-ok($charinfo{title}, undef);
-ok($charinfo{block}, undef);
-ok($charinfo{script}, undef);
+$charinfo = charinfo(0x590);
+
+ok($charinfo->{code}, undef);
+ok($charinfo->{name}, undef);
+ok($charinfo->{category}, undef);
+ok($charinfo->{combining}, undef);
+ok($charinfo->{bidi}, undef);
+ok($charinfo->{decomposition}, undef);
+ok($charinfo->{decimal}, undef);
+ok($charinfo->{digit}, undef);
+ok($charinfo->{numeric}, undef);
+ok($charinfo->{mirrored}, undef);
+ok($charinfo->{unicode10}, undef);
+ok($charinfo->{comment}, undef);
+ok($charinfo->{upper}, undef);
+ok($charinfo->{lower}, undef);
+ok($charinfo->{title}, undef);
+ok($charinfo->{block}, undef);
+ok($charinfo->{script}, undef);
# 0x05d0 is in the Hebrew block and used.
-%charinfo = charinfo(0x5d0);
-
-ok($charinfo{code}, '05D0');
-ok($charinfo{name}, 'HEBREW LETTER ALEF');
-ok($charinfo{category}, 'Lo');
-ok($charinfo{combining}, '0');
-ok($charinfo{bidi}, 'R');
-ok($charinfo{decomposition}, '');
-ok($charinfo{decimal}, '');
-ok($charinfo{digit}, '');
-ok($charinfo{numeric}, '');
-ok($charinfo{mirrored}, 'N');
-ok($charinfo{unicode10}, '');
-ok($charinfo{comment}, '');
-ok($charinfo{upper}, '');
-ok($charinfo{lower}, '');
-ok($charinfo{title}, '');
-ok($charinfo{block}, 'Hebrew');
-ok($charinfo{script}, 'Hebrew');
+$charinfo = charinfo(0x5d0);
+
+ok($charinfo->{code}, '05D0');
+ok($charinfo->{name}, 'HEBREW LETTER ALEF');
+ok($charinfo->{category}, 'Lo');
+ok($charinfo->{combining}, '0');
+ok($charinfo->{bidi}, 'R');
+ok($charinfo->{decomposition}, '');
+ok($charinfo->{decimal}, '');
+ok($charinfo->{digit}, '');
+ok($charinfo->{numeric}, '');
+ok($charinfo->{mirrored}, 'N');
+ok($charinfo->{unicode10}, '');
+ok($charinfo->{comment}, '');
+ok($charinfo->{upper}, '');
+ok($charinfo->{lower}, '');
+ok($charinfo->{title}, '');
+ok($charinfo->{block}, 'Hebrew');
+ok($charinfo->{script}, 'Hebrew');
use Unicode::UCD qw(charblock charscript);
ok(charblock(0x590), 'Hebrew');
ok(charscript(0x590), undef);
-%charinfo = charinfo(0xbe);
-
-ok($charinfo{code}, '00BE');
-ok($charinfo{name}, 'VULGAR FRACTION THREE QUARTERS');
-ok($charinfo{category}, 'No');
-ok($charinfo{combining}, '0');
-ok($charinfo{bidi}, 'ON');
-ok($charinfo{decomposition}, '<fraction> 0033 2044 0034');
-ok($charinfo{decimal}, '');
-ok($charinfo{digit}, '');
-ok($charinfo{numeric}, '3/4');
-ok($charinfo{mirrored}, 'N');
-ok($charinfo{unicode10}, 'FRACTION THREE QUARTERS');
-ok($charinfo{comment}, '');
-ok($charinfo{upper}, '');
-ok($charinfo{lower}, '');
-ok($charinfo{title}, '');
-ok($charinfo{block}, 'Latin-1 Supplement');
-ok($charinfo{script}, undef);
+$charinfo = charinfo(0xbe);
+
+ok($charinfo->{code}, '00BE');
+ok($charinfo->{name}, 'VULGAR FRACTION THREE QUARTERS');
+ok($charinfo->{category}, 'No');
+ok($charinfo->{combining}, '0');
+ok($charinfo->{bidi}, 'ON');
+ok($charinfo->{decomposition}, '<fraction> 0033 2044 0034');
+ok($charinfo->{decimal}, '');
+ok($charinfo->{digit}, '');
+ok($charinfo->{numeric}, '3/4');
+ok($charinfo->{mirrored}, 'N');
+ok($charinfo->{unicode10}, 'FRACTION THREE QUARTERS');
+ok($charinfo->{comment}, '');
+ok($charinfo->{upper}, '');
+ok($charinfo->{lower}, '');
+ok($charinfo->{title}, '');
+ok($charinfo->{block}, 'Latin-1 Supplement');
+ok($charinfo->{script}, undef);
use Unicode::UCD qw(charblocks charscripts);
-my %charblocks = charblocks();
+my $charblocks = charblocks();
-ok(exists $charblocks{Thai});
-ok($charblocks{Thai}->[0]->[0], hex('0e00'));
-ok(!exists $charblocks{PigLatin});
+ok(exists $charblocks->{Thai});
+ok($charblocks->{Thai}->[0]->[0], hex('0e00'));
+ok(!exists $charblocks->{PigLatin});
-my %charscripts = charscripts();
+my $charscripts = charscripts();
-ok(exists $charscripts{Armenian});
-ok($charscripts{Armenian}->[0]->[0], hex('0531'));
-ok(!exists $charscripts{PigLatin});
+ok(exists $charscripts->{Armenian});
+ok($charscripts->{Armenian}->[0]->[0], hex('0531'));
+ok(!exists $charscripts->{PigLatin});
my $charscript;
ok(!charinrange($ranges, "13f5"));
ok(Unicode::UCD::UnicodeVersion, 3.1);
+
+use Unicode::UCD qw(compexcl);
+
+ok(!compexcl(0x0100));
+ok( compexcl(0x0958));
+
+use Unicode::UCD qw(casefold);
+
+my $casefold;
+
+$casefold = casefold(0x41);
+
+ok($casefold->{code} eq '0041' &&
+ $casefold->{status} eq 'C' &&
+ $casefold->{mapping} eq '0061');
+
+$casefold = casefold(0xdf);
+
+ok($casefold->{code} eq '00DF' &&
+ $casefold->{status} eq 'F' &&
+ $casefold->{mapping} eq '0073 0073');
+
+ok(!casefold(0x20));
+
+use Unicode::UCD qw(casespec);
+
+my $casespec;
+
+ok(!casespec(0x41));
+
+$casespec = casespec(0xdf);
+
+ok($casespec->{code} eq '00DF' &&
+ $casespec->{lower} eq '00DF' &&
+ $casespec->{title} eq '0053 0073' &&
+ $casespec->{upper} eq '0053 0053' &&
+ $casespec->{condition} eq undef);
+
+$casespec = casespec(0x307);
+
+ok($casespec->{code} eq '0307' &&
+ $casespec->{lower} eq '0307' &&
+ $casespec->{title} eq '' &&
+ $casespec->{upper} eq '' &&
+ $casespec->{condition} eq 'lt AFTER_i');