From 82e740b6c60f868ffdb3cc1b574228fb4f53f66d Mon Sep 17 00:00:00 2001 From: Nicholas Clark Date: Mon, 27 Oct 2003 13:45:27 +0000 Subject: [PATCH] Sync with Unicode::Normalize 0.25 p4raw-id: //depot/perl@21550 --- MANIFEST | 4 + ext/Unicode/Normalize/Changes | 16 ++++ ext/Unicode/Normalize/Makefile.PL | 32 +------- ext/Unicode/Normalize/Normalize.pm | 125 ++++++++++++++++++++++------- ext/Unicode/Normalize/Normalize.xs | 160 ++++++++++++++++++++++++++++++++++--- ext/Unicode/Normalize/README | 29 ++++--- ext/Unicode/Normalize/t/fcdc.t | 71 ++++++++++++++++ ext/Unicode/Normalize/t/form.t | 71 ++++++++++++++++ ext/Unicode/Normalize/t/proto.t | 75 +++++++++++++++++ ext/Unicode/Normalize/t/split.t | 73 +++++++++++++++++ 10 files changed, 572 insertions(+), 84 deletions(-) create mode 100644 ext/Unicode/Normalize/t/fcdc.t create mode 100644 ext/Unicode/Normalize/t/form.t create mode 100644 ext/Unicode/Normalize/t/proto.t create mode 100644 ext/Unicode/Normalize/t/split.t diff --git a/MANIFEST b/MANIFEST index 40eb2ca..d7ad32c 100644 --- a/MANIFEST +++ b/MANIFEST @@ -758,8 +758,12 @@ ext/Unicode/Normalize/mkheader Unicode::Normalize ext/Unicode/Normalize/Normalize.pm Unicode::Normalize ext/Unicode/Normalize/Normalize.xs Unicode::Normalize ext/Unicode/Normalize/README Unicode::Normalize +ext/Unicode/Normalize/t/fcdc.t Unicode::Normalize +ext/Unicode/Normalize/t/form.t Unicode::Normalize ext/Unicode/Normalize/t/func.t Unicode::Normalize ext/Unicode/Normalize/t/norm.t Unicode::Normalize +ext/Unicode/Normalize/t/proto.t Unicode::Normalize +ext/Unicode/Normalize/t/split.t Unicode::Normalize ext/Unicode/Normalize/t/test.t Unicode::Normalize ext/util/make_ext Used by Makefile to execute extension Makefiles ext/XS/APItest/APItest.pm XS::APItest extension diff --git a/ext/Unicode/Normalize/Changes b/ext/Unicode/Normalize/Changes index 844ac39..74b87e7 100644 --- a/ext/Unicode/Normalize/Changes +++ b/ext/Unicode/Normalize/Changes @@ -1,5 +1,21 @@ Revision history for Perl extension Unicode::Normalize. +0.25 Mon Oct 6 22:26:03 2003 + - added form.t and proto.t. + +0.24 Sat Oct 4 17:57:10 2003 + - supports FCD and FCC (UTN #5): + FCD(), normalize('FCD'), checkFCD(), check('FCD'); + FCC(), normalize('FCC'), checkFCC(), check('FCC'). + - changed INSTALLATION (cf. README). + * Initial state of the distribution is changed to XSUB. To build + pure Perl, type before . + * The purePerl-XSUB converter is now provided as two perl + script files, named "enableXS" and "disableXS". + (no longer and .) + * simplified Makefile.PL. + - added fcdc.t and split.t. + 0.23 Sat Jun 28 20:38:10 2003 - bug fix: \0-terminate in compose() in XS. - tweak in pure perl: forced $codepoint to numeric (i.e. "+0065" to 65) diff --git a/ext/Unicode/Normalize/Makefile.PL b/ext/Unicode/Normalize/Makefile.PL index 8688132..2f37b62 100644 --- a/ext/Unicode/Normalize/Makefile.PL +++ b/ext/Unicode/Normalize/Makefile.PL @@ -1,35 +1,5 @@ use ExtUtils::MakeMaker; -# This code for XS-NOXS installer is shamelessly stolen -# after Gurusamy Sarathy's Data::Dumper. Thank you! - -# a bit modified. - -use File::Copy qw(); - -my $arg = $ARGV[0] || ""; - -if ($arg =~ /^no/i and -f "Normalize.xs") { - print STDERR "Disabling XS in sources...\n"; - - die "***** Failed, sources could be inconsistent! *****\n" - unless File::Copy::move('MANIFEST', 'MANIFEST.XS') - and File::Copy::move('MANIFEST.NXS', 'MANIFEST') - and File::Copy::move('Normalize.pm', 'Normalize.pm.XS') - and File::Copy::move('Normalize.xs', 'Normalize.xs.XS') - and File::Copy::move('Normalize.pm.NXS','Normalize.pm'); -} -if ($arg =~ /^xs/i and -f "Normalize.xs.XS") { - print STDERR "Enabling XS in sources...\n"; - - die "***** Failed, sources could be inconsistent! *****\n" - unless File::Copy::move('MANIFEST', 'MANIFEST.NXS') - and File::Copy::move('MANIFEST.XS', 'MANIFEST') - and File::Copy::move('Normalize.pm', 'Normalize.pm.NXS') - and File::Copy::move('Normalize.xs.XS', 'Normalize.xs') - and File::Copy::move('Normalize.pm.XS', 'Normalize.pm'); -} - my $clean = {}; if (-f "Normalize.xs") { @@ -40,7 +10,7 @@ if (-f "Normalize.xs") { } WriteMakefile( - 'INSTALLDIRS' => $] > 5.007 ? 'perl' : 'site', + 'INSTALLDIRS' => $] >= 5.007 ? 'perl' : 'site', 'NAME' => 'Unicode::Normalize', 'VERSION_FROM' => 'Normalize.pm', # finds $VERSION 'clean' => $clean, diff --git a/ext/Unicode/Normalize/Normalize.pm b/ext/Unicode/Normalize/Normalize.pm index 6e161d3..704303d 100644 --- a/ext/Unicode/Normalize/Normalize.pm +++ b/ext/Unicode/Normalize/Normalize.pm @@ -11,12 +11,11 @@ use strict; use warnings; use Carp; -our $VERSION = '0.23'; +our $VERSION = '0.25'; our $PACKAGE = __PACKAGE__; require Exporter; require DynaLoader; -require AutoLoader; our @ISA = qw(Exporter DynaLoader); our @EXPORT = qw( NFC NFD NFKC NFKD ); @@ -26,15 +25,22 @@ our @EXPORT_OK = qw( getCanon getCompat getComposite getCombinClass isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE + FCD checkFCD FCC checkFCC composeContiguous + splitOnLastStarter ); our %EXPORT_TAGS = ( all => [ @EXPORT, @EXPORT_OK ], normalize => [ @EXPORT, qw/normalize decompose reorder compose/ ], check => [ qw/checkNFD checkNFKD checkNFC checkNFKC check/ ], + fast => [ qw/FCD checkFCD FCC checkFCC composeContiguous/ ], ); +###### + bootstrap Unicode::Normalize $VERSION; +###### + sub pack_U { return pack('U*', @_); } @@ -43,6 +49,11 @@ sub unpack_U { return unpack('U*', pack('U*').shift); } + +## +## normalization forms +## + use constant COMPAT => 1; sub NFD ($) { reorder(decompose($_[0])) } @@ -50,30 +61,49 @@ sub NFKD ($) { reorder(decompose($_[0], COMPAT)) } sub NFC ($) { compose(reorder(decompose($_[0]))) } sub NFKC ($) { compose(reorder(decompose($_[0], COMPAT))) } +sub FCD ($) { + my $str = shift; + return checkFCD($str) ? $str : NFD($str); +} +sub FCC ($) { composeContiguous(reorder(decompose($_[0]))) } + +our %formNorm = ( + NFC => \&NFC, C => \&NFC, + NFD => \&NFD, D => \&NFD, + NFKC => \&NFKC, KC => \&NFKC, + NFKD => \&NFKD, KD => \&NFKD, + FCD => \&FCD, FCC => \&FCC, +); + sub normalize($$) { my $form = shift; my $str = shift; - $form =~ s/^NF//; - return - $form eq 'D' ? NFD ($str) : - $form eq 'C' ? NFC ($str) : - $form eq 'KD' ? NFKD($str) : - $form eq 'KC' ? NFKC($str) : - croak $PACKAGE."::normalize: invalid form name: $form"; + return exists $formNorm{$form} + ? $formNorm{$form}->($str) + : croak $PACKAGE."::normalize: invalid form name: $form"; } + +## +## quick check +## + +our %formCheck = ( + NFC => \&checkNFC, C => \&checkNFC, + NFD => \&checkNFD, D => \&checkNFD, + NFKC => \&checkNFKC, KC => \&checkNFKC, + NFKD => \&checkNFKD, KD => \&checkNFKD, + FCD => \&checkFCD, FCC => \&checkFCC, +); + sub check($$) { my $form = shift; my $str = shift; - $form =~ s/^NF//; - return - $form eq 'D' ? checkNFD ($str) : - $form eq 'C' ? checkNFC ($str) : - $form eq 'KD' ? checkNFKD($str) : - $form eq 'KC' ? checkNFKC($str) : - croak $PACKAGE."::check: invalid form name: $form"; + return exists $formCheck{$form} + ? $formCheck{$form}->($str) + : croak $PACKAGE."::check: invalid form name: $form"; } 1; @@ -138,14 +168,31 @@ returns the Normalization Form KD (formed by compatibility decomposition). returns the Normalization Form KC (formed by compatibility decomposition followed by B composition). +=item C<$FCD_string = FCD($string)> + +If the given string is in FCD ("Fast C or D" form; cf. UTN #5), +returns it without modification; otherwise returns an FCD string. + +Note: FCD is not always unique, then plural forms may be equivalent +each other. C will return one of these equivalent forms. + +=item C<$FCC_string = FCC($string)> + +returns the FCC form ("Fast C Contiguous"; cf. UTN #5). + +Note: FCD is unique, as well as four normalization forms (NF*). + =item C<$normalized_string = normalize($form_name, $string)> As C<$form_name>, one of the following names must be given. - 'C' or 'NFC' for Normalization Form C - 'D' or 'NFD' for Normalization Form D - 'KC' or 'NFKC' for Normalization Form KC - 'KD' or 'NFKD' for Normalization Form KD + 'C' or 'NFC' for Normalization Form C (UAX #15) + 'D' or 'NFD' for Normalization Form D (UAX #15) + 'KC' or 'NFKC' for Normalization Form KC (UAX #15) + 'KD' or 'NFKD' for Normalization Form KD (UAX #15) + + 'FCD' for "Fast C or D" Form (UTN #5) + 'FCC' for "Fast C Contiguous" (UTN #5) =back @@ -194,7 +241,7 @@ you can get its NFC/NFKC string, saying =head2 Quick Check -(see Annex 8, UAX #15, and F) +(see Annex 8, UAX #15; and F) The following functions check whether the string is in that normalization form. @@ -222,6 +269,17 @@ returns C (C<1>) or C (C). returns C (C<1>), C (C), or C (C). +=item C<$result = checkFCD($string)> + +returns C (C<1>) or C (C). + +=item C<$result = checkFCC($string)> + +returns C (C<1>), C (C), or C (C). + +If a string is not in C, it must not be in . +So C should return C. + =item C<$result = check($form_name, $string)> returns C (C<1>), C (C), or C (C). @@ -232,23 +290,26 @@ C<$form_name> is alike to that for C. B -In the cases of NFD and NFKD, the answer must be either C or C. -The answer C may be returned in the cases of NFC and NFKC. +In the cases of NFD, NFKD, and FCD, the answer must be +either C or C. The answer C may be returned +in the cases of NFC, NFKC, and FCC. -A MAYBE-NFC/NFKC string should contain at least -one combining character or the like. -For example, C has +A C string should contain at least one combining character +or the like. For example, C has the MAYBE_NFC/MAYBE_NFKC property. + Both C and C will return C. C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC (its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">), while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC. -If you want to check exactly, compare the string with its NFC/NFKC; i.e., +If you want to check exactly, compare the string with its NFC/NFKC/FCC; +i.e., - $string eq NFC($string) # more thorough than checkNFC($string) - $string eq NFKC($string) # more thorough than checkNFKC($string) + $string eq NFC($string) # thorough than checkNFC($string) + $string eq NFKC($string) # thorough than checkNFKC($string) + $string eq FCC($string) # thorough than checkFCC($string) =head2 Character Data @@ -319,7 +380,7 @@ C and other some functions: on request. =head1 AUTHOR -SADAHIRO Tomoyuki, ESADAHIRO@cpan.orgE +SADAHIRO Tomoyuki, http://homepage1.nifty.com/nomenclator/perl/ @@ -340,6 +401,10 @@ Unicode Normalization Forms - UAX #15 Derived Normalization Properties +=item http://www.unicode.org/notes/tn5/ + +Canonical Equivalence in Applications - UTN #5 + =back =cut diff --git a/ext/Unicode/Normalize/Normalize.xs b/ext/Unicode/Normalize/Normalize.xs index 987a839..04d0256 100644 --- a/ext/Unicode/Normalize/Normalize.xs +++ b/ext/Unicode/Normalize/Normalize.xs @@ -20,6 +20,12 @@ #define utf8n_to_uvuni utf8_to_uv #endif /* utf8n_to_uvuni */ +/* if utf8n_to_uvuni() sets retlen to 0 when flags = 0 */ +#define ErrRetlenIsZero "panic (Unicode::Normalize): zero-length character" + +/* utf8_hop() hops back before start. Maybe broken UTF-8 */ +#define ErrHopBeforeStart "panic (Unicode::Normalize): hopping before start" + /* At present, char > 0x10ffff are unaffected without complaint, right? */ #define VALID_UTF_MAX (0x10ffff) #define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv)) @@ -187,9 +193,11 @@ decompose(arg, compat = &PL_sv_no) s = (U8*)SvPV(src,srclen); e = s + srclen; - for (p = s; p < e;) { + for (p = s; p < e; p += retlen) { uv = utf8n_to_uvuni(p, e - p, &retlen, 0); - p += retlen; + if (!retlen) + croak(ErrRetlenIsZero); + if (Hangul_IsS(uv)) sv_cat_decompHangul(dst, uv); else { @@ -197,7 +205,7 @@ decompose(arg, compat = &PL_sv_no) if (r) sv_catpv(dst, (char *)r); else - sv_catpvn(dst, (char *)p - retlen, retlen); + sv_catpvn(dst, (char *)p, retlen); } } RETVAL = dst; @@ -242,9 +250,13 @@ reorder(arg) STRLEN cc_len, cc_iter, cc_pos; uv = utf8n_to_uvuni(p, e - p, &retlen, 0); - curCC = getCombinClass(uv); + if (!retlen) + croak(ErrRetlenIsZero); p += retlen; + + + curCC = getCombinClass(uv); if (! (curCC && p < e)) continue; else @@ -257,10 +269,14 @@ reorder(arg) while (p < e) { uv = utf8n_to_uvuni(p, e - p, &retlen, 0); + if (!retlen) + croak(ErrRetlenIsZero); + p += retlen; + curCC = getCombinClass(uv); if (!curCC) break; - p += retlen; + cc_pos++; if (stk_cc_max <= cc_pos) { /* extend if need */ stk_cc_max = cc_pos + 1; @@ -294,6 +310,8 @@ SV* compose(arg) SV * arg PROTOTYPE: $ + ALIAS: + composeContiguous = 1 PREINIT: SV *src, *dst, *tmp; U8 *s, *p, *e, *d, *t, *tmp_start, curCC, preCC; @@ -324,6 +342,8 @@ compose(arg) for (p = s; p < e;) { if (beginning) { uvS = utf8n_to_uvuni(p, e - p, &retlen, 0); + if (!retlen) + croak(ErrRetlenIsZero); p += retlen; if (getCombinClass(uvS)) { /* no Starter found yet */ @@ -340,7 +360,10 @@ compose(arg) /* to the next Starter */ while (p < e) { uv = utf8n_to_uvuni(p, e - p, &retlen, 0); + if (!retlen) + croak(ErrRetlenIsZero); p += retlen; + curCC = getCombinClass(uv); if (preCC && preCC == curCC) { @@ -349,7 +372,8 @@ compose(arg) } else { uvComp = composite_uv(uvS, uv); - if (uvComp && ! isExclusion(uvComp) && preCC <= curCC) { + if (uvComp && ! isExclusion(uvComp) && + (ix ? (t == tmp_start) : (preCC <= curCC))) { STRLEN leftcur, rightcur, dstcur; leftcur = UNISKIP(uvComp); rightcur = UNISKIP(uvS) + UNISKIP(uv); @@ -385,7 +409,6 @@ compose(arg) RETVAL - void checkNFD(arg) SV * arg @@ -397,7 +420,7 @@ checkNFD(arg) SV *src; STRLEN srclen, retlen; U8 *s, *e, *p, curCC, preCC; - PPCODE: + CODE: if (SvUTF8(arg)) { src = arg; } else { @@ -411,6 +434,9 @@ checkNFD(arg) preCC = 0; for (p = s; p < e; p += retlen) { uv = utf8n_to_uvuni(p, e - p, &retlen, 0); + if (!retlen) + croak(ErrRetlenIsZero); + curCC = getCombinClass(uv); if (preCC > curCC && curCC != 0) /* canonical ordering violated */ XSRETURN_NO; @@ -434,7 +460,7 @@ checkNFC(arg) STRLEN srclen, retlen; U8 *s, *e, *p, curCC, preCC; bool isMAYBE; - PPCODE: + CODE: if (SvUTF8(arg)) { src = arg; } else { @@ -449,6 +475,9 @@ checkNFC(arg) isMAYBE = FALSE; for (p = s; p < e; p += retlen) { uv = utf8n_to_uvuni(p, e - p, &retlen, 0); + if (!retlen) + croak(ErrRetlenIsZero); + curCC = getCombinClass(uv); if (preCC > curCC && curCC != 0) /* canonical ordering violated */ @@ -479,6 +508,78 @@ checkNFC(arg) +void +checkFCD(arg) + SV * arg + PROTOTYPE: $ + ALIAS: + checkFCC = 1 + PREINIT: + UV uv, uvLead, uvTrail; + SV *src; + STRLEN srclen, retlen, canlen, canret; + U8 *s, *e, *p, curCC, preCC; + U8 *sCan, *pCan, *eCan; + bool isMAYBE; + CODE: + if (SvUTF8(arg)) { + src = arg; + } else { + src = sv_mortalcopy(arg); + sv_utf8_upgrade(src); + } + + s = (U8*)SvPV(src,srclen); + e = s + srclen; + + preCC = 0; + isMAYBE = FALSE; + for (p = s; p < e; p += retlen) { + uv = utf8n_to_uvuni(p, e - p, &retlen, 0); + if (!retlen) + croak(ErrRetlenIsZero); + + sCan = (U8*) dec_canonical(uv); + + if (sCan) { + canlen = (STRLEN)strlen((char *) sCan); + uvLead = utf8n_to_uvuni(sCan, canlen, &canret, 0); + } + else { + uvLead = uv; + } + + curCC = getCombinClass(uvLead); + + if (curCC != 0 && curCC < preCC) /* canonical ordering violated */ + XSRETURN_NO; + + if (ix) { + if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) + XSRETURN_NO; + else if (isComp2nd(uv)) + isMAYBE = TRUE; + } + + if (sCan) { + eCan = sCan + canlen; + pCan = utf8_hop(eCan, -1); + if (pCan < sCan) + croak(ErrHopBeforeStart); + uvTrail = utf8n_to_uvuni(pCan, eCan - pCan, &canret, 0); + preCC = getCombinClass(uvTrail); + } + else { + preCC = curCC; + } + } + if (isMAYBE) + XSRETURN_UNDEF; + else + XSRETURN_YES; + + + U8 getCombinClass(uv) UV uv @@ -515,7 +616,7 @@ isNFD_NO(uv) PROTOTYPE: $ ALIAS: isNFKD_NO = 1 - PPCODE: + CODE: if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv))) XSRETURN_YES; /* NFD_NO or NFKD_NO */ else @@ -530,7 +631,7 @@ isComp_Ex(uv) ALIAS: isNFC_NO = 0 isNFKC_NO = 1 - PPCODE: + CODE: if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) XSRETURN_YES; /* NFC_NO or NFKC_NO */ else if (ix) { @@ -587,3 +688,40 @@ getCanon(uv) OUTPUT: RETVAL + +void +splitOnLastStarter(arg) + SV * arg + PREINIT: + UV uv; + SV *src, *svp; + STRLEN srclen, retlen; + U8 *s, *e, *p; + PPCODE: + if (SvUTF8(arg)) { + src = arg; + } else { + src = sv_mortalcopy(arg); + sv_utf8_upgrade(src); + } + + s = (U8*)SvPV(src,srclen); + e = s + srclen; + + for (p = e; s < p; ) { + p = utf8_hop(p, -1); + if (p < s) + croak(ErrHopBeforeStart); + uv = utf8n_to_uvuni(p, e - p, &retlen, 0); + if (getCombinClass(uv) == 0) /* Last Starter found */ + break; + } + + svp = sv_2mortal(newSVpvn((char*)s, p - s)); + SvUTF8_on(svp); + XPUSHs(svp); + + svp = sv_2mortal(newSVpvn((char*)p, e - p)); + SvUTF8_on(svp); + XPUSHs(svp); + diff --git a/ext/Unicode/Normalize/README b/ext/Unicode/Normalize/README index 5392df4..4fbdfe1 100644 --- a/ext/Unicode/Normalize/README +++ b/ext/Unicode/Normalize/README @@ -1,4 +1,4 @@ -Unicode/Normalize version 0.22 +Unicode/Normalize version 0.25 =================================== Unicode::Normalize - Unicode Normalization Forms @@ -24,35 +24,37 @@ SYNOPSIS INSTALLATION -Perl 5.6.1 or later +Perl 5.6.1 or later (Caution: Perl 5.6.0 is not recommended.) -To install this module type the following: +To install this module (XSUB: needs a C compiler), type the following: perl Makefile.PL make make test make install -If you have a C compiler and want to use the XS version, -type the following: +If you want to install pure Perl (i.e. no-XSUB), +type the following (!! "disableXS" must run before "Makefile.PL" !!): - perl Makefile.PL xs + perl disableXS + perl Makefile.PL make make test make install -If you decide to install the NoXS version after trying to build the XS, -type the following: +After building no-XSUB, if you decide to install XSUB, +type the following (!! "enableXS" must run before "Makefile.PL" !!): make clean - perl Makefile.PL noxs + perl enableXS + perl Makefile.PL make make test make install DEPENDENCIES -This module requires these other modules and libraries: +This module requires other modules and libraries following: Carp Exporter @@ -70,14 +72,17 @@ CAVEAT (2) After these unicore/*.* files are updated. - In the case of an XS version: + In the case of an XS edition: You must rebuild the module, as the data will be compiled on building. - In the case of a NoXS version: + In the case of a pure Perl edition: Rebuilding is not necessary, as the data will be read on requirement. +(3) Pure Perl edition, Normalize.pmN, may work without any other file + in this distribution (it must be renamed Normalize.pm, though) + COPYRIGHT AND LICENCE SADAHIRO Tomoyuki diff --git a/ext/Unicode/Normalize/t/fcdc.t b/ext/Unicode/Normalize/t/fcdc.t new file mode 100644 index 0000000..ea10a64 --- /dev/null +++ b/ext/Unicode/Normalize/t/fcdc.t @@ -0,0 +1,71 @@ + +BEGIN { + unless ("A" eq pack('U', 0x41)) { + print "1..0 # Unicode::Normalize " . + "cannot stringify a Unicode code point\n"; + exit 0; + } +} + +BEGIN { + if ($ENV{PERL_CORE}) { + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); + } +} + +######################### + +use Test; +use strict; +use warnings; +BEGIN { plan tests => 35 }; +use Unicode::Normalize qw(:all); +ok(1); # If we made it this far, we're ok. + +sub _pack_U { Unicode::Normalize::pack_U(@_) } +sub _unpack_U { Unicode::Normalize::unpack_U(@_) } +sub answer { defined $_[0] ? $_[0] ? "YES" : "NO" : "MAYBE" } + +######################### + +ok(answer(checkFCD('')), 'YES'); +ok(answer(checkFCD('A')), 'YES'); +ok(answer(checkFCD("\x{030A}")), 'YES'); # 030A;COMBINING RING ABOVE +ok(answer(checkFCD("\x{0327}")), 'YES'); # 0327;COMBINING CEDILLA +ok(answer(checkFCD(_pack_U(0x00C5))), 'YES'); # A with ring above +ok(answer(checkFCD(_pack_U(0x41, 0x30A))), 'YES'); # A+ring +ok(answer(checkFCD(_pack_U(0x41, 0x327, 0x30A))), 'YES'); # A+cedilla+ring +ok(answer(checkFCD(_pack_U(0x41, 0x30A, 0x327))), 'NO'); # A+ring+cedilla +ok(answer(checkFCD(_pack_U(0xC5, 0x0327))), 'NO'); # A-ring+cedilla +ok(answer(checkNFC(_pack_U(0xC5, 0x0327))), 'MAYBE'); # NFC: A-ring+cedilla +ok(answer(check("FCD", _pack_U(0xC5, 0x0327))), 'NO'); +ok(answer(check("NFC", _pack_U(0xC5, 0x0327))), 'MAYBE'); +ok(answer(checkFCD("\x{AC01}\x{1100}\x{1161}")), 'YES'); # hangul +ok(answer(checkFCD("\x{212B}\x{F900}")), 'YES'); # compat + +ok(FCD(''), ""); +ok(FCC(''), ""); + +ok(FCD('A'), "A"); +ok(FCC('A'), "A"); + +ok(answer(checkFCD(_pack_U(0x1EA7, 0x05AE, 0x0315, 0x0062))), "NO"); +ok(answer(checkFCC(_pack_U(0x1EA7, 0x05AE, 0x0315, 0x0062))), "NO"); + +ok(FCC(_pack_U(0xC5, 0x327)), _pack_U(0x41, 0x327, 0x30A)); +ok(FCC(_pack_U(0x45, 0x304, 0x300)), _pack_U(0x1E14)); +ok(FCC("\x{1100}\x{1161}\x{1100}\x{1173}\x{11AF}"), "\x{AC00}\x{AE00}"); + +ok(answer(checkFCC('')), 'YES'); +ok(answer(checkFCC('A')), 'YES'); +ok(answer(checkFCC("\x{030A}")), 'MAYBE'); # 030A;COMBINING RING ABOVE +ok(answer(checkFCC("\x{0327}")), 'MAYBE'); # 0327;COMBINING CEDILLA +ok(answer(checkFCC(_pack_U(0x00C5))), 'YES'); # A with ring above +ok(answer(checkFCC(_pack_U(0x41, 0x30A))), 'MAYBE'); # A+ring +ok(answer(checkFCC(_pack_U(0x41, 0x327, 0x30A))), 'MAYBE'); # A+cedilla+ring +ok(answer(checkFCC(_pack_U(0x41, 0x30A, 0x327))), 'NO'); # A+ring+cedilla +ok(answer(checkFCC(_pack_U(0xC5, 0x0327))), 'NO'); # A-ring+cedilla +ok(answer(checkFCC("\x{AC01}\x{1100}\x{1161}")), 'MAYBE'); # hangul +ok(answer(checkFCC("\x{212B}\x{F900}")), 'NO'); # compat + diff --git a/ext/Unicode/Normalize/t/form.t b/ext/Unicode/Normalize/t/form.t new file mode 100644 index 0000000..4e9b885 --- /dev/null +++ b/ext/Unicode/Normalize/t/form.t @@ -0,0 +1,71 @@ + +BEGIN { + unless ("A" eq pack('U', 0x41)) { + print "1..0 # Unicode::Normalize " . + "cannot stringify a Unicode code point\n"; + exit 0; + } +} + +BEGIN { + if ($ENV{PERL_CORE}) { + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); + } +} + +######################### + +use Test; +use strict; +use warnings; +BEGIN { plan tests => 37 }; +use Unicode::Normalize qw(:all); +ok(1); # If we made it this far, we're ok. + +sub answer { defined $_[0] ? $_[0] ? "YES" : "NO" : "MAYBE" } + +######################### + +ok(NFD ("\x{304C}\x{FF76}"), "\x{304B}\x{3099}\x{FF76}"); +ok(NFC ("\x{304C}\x{FF76}"), "\x{304C}\x{FF76}"); +ok(NFKD("\x{304C}\x{FF76}"), "\x{304B}\x{3099}\x{30AB}"); +ok(NFKC("\x{304C}\x{FF76}"), "\x{304C}\x{30AB}"); + +ok(answer(checkNFD ("\x{304C}")), "NO"); +ok(answer(checkNFC ("\x{304C}")), "YES"); +ok(answer(checkNFKD("\x{304C}")), "NO"); +ok(answer(checkNFKC("\x{304C}")), "YES"); +ok(answer(checkNFD ("\x{FF76}")), "YES"); +ok(answer(checkNFC ("\x{FF76}")), "YES"); +ok(answer(checkNFKD("\x{FF76}")), "NO"); +ok(answer(checkNFKC("\x{FF76}")), "NO"); + +ok(normalize('D', "\x{304C}\x{FF76}"), "\x{304B}\x{3099}\x{FF76}"); +ok(normalize('C', "\x{304C}\x{FF76}"), "\x{304C}\x{FF76}"); +ok(normalize('KD',"\x{304C}\x{FF76}"), "\x{304B}\x{3099}\x{30AB}"); +ok(normalize('KC',"\x{304C}\x{FF76}"), "\x{304C}\x{30AB}"); + +ok(answer(check('D', "\x{304C}")), "NO"); +ok(answer(check('C', "\x{304C}")), "YES"); +ok(answer(check('KD',"\x{304C}")), "NO"); +ok(answer(check('KC',"\x{304C}")), "YES"); +ok(answer(check('D' ,"\x{FF76}")), "YES"); +ok(answer(check('C' ,"\x{FF76}")), "YES"); +ok(answer(check('KD',"\x{FF76}")), "NO"); +ok(answer(check('KC',"\x{FF76}")), "NO"); + +ok(normalize('NFD', "\x{304C}\x{FF76}"), "\x{304B}\x{3099}\x{FF76}"); +ok(normalize('NFC', "\x{304C}\x{FF76}"), "\x{304C}\x{FF76}"); +ok(normalize('NFKD',"\x{304C}\x{FF76}"), "\x{304B}\x{3099}\x{30AB}"); +ok(normalize('NFKC',"\x{304C}\x{FF76}"), "\x{304C}\x{30AB}"); + +ok(answer(check('NFD', "\x{304C}")), "NO"); +ok(answer(check('NFC', "\x{304C}")), "YES"); +ok(answer(check('NFKD',"\x{304C}")), "NO"); +ok(answer(check('NFKC',"\x{304C}")), "YES"); +ok(answer(check('NFD' ,"\x{FF76}")), "YES"); +ok(answer(check('NFC' ,"\x{FF76}")), "YES"); +ok(answer(check('NFKD',"\x{FF76}")), "NO"); +ok(answer(check('NFKC',"\x{FF76}")), "NO"); + diff --git a/ext/Unicode/Normalize/t/proto.t b/ext/Unicode/Normalize/t/proto.t new file mode 100644 index 0000000..3c4298d --- /dev/null +++ b/ext/Unicode/Normalize/t/proto.t @@ -0,0 +1,75 @@ + +BEGIN { + unless ("A" eq pack('U', 0x41)) { + print "1..0 # Unicode::Normalize " . + "cannot stringify a Unicode code point\n"; + exit 0; + } +} + +BEGIN { + if ($ENV{PERL_CORE}) { + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); + } +} + +######################### + +use Test; +use strict; +use warnings; +BEGIN { plan tests => 42 }; +use Unicode::Normalize qw(:all); +ok(1); # If we made it this far, we're ok. + +######################### + +# unary op. RING-CEDILLA +ok( "\x{30A}\x{327}" ne "\x{327}\x{30A}"); +ok(NFD "\x{30A}\x{327}" eq "\x{327}\x{30A}"); +ok(NFC "\x{30A}\x{327}" eq "\x{327}\x{30A}"); +ok(NFKD "\x{30A}\x{327}" eq "\x{327}\x{30A}"); +ok(NFKC "\x{30A}\x{327}" eq "\x{327}\x{30A}"); +ok(FCD "\x{30A}\x{327}" eq "\x{327}\x{30A}"); +ok(FCC "\x{30A}\x{327}" eq "\x{327}\x{30A}"); +ok(reorder "\x{30A}\x{327}" eq "\x{327}\x{30A}"); + +ok(prototype \&normalize,'$$'); +ok(prototype \&NFD, '$'); +ok(prototype \&NFC, '$'); +ok(prototype \&NFKD, '$'); +ok(prototype \&NFKC, '$'); +ok(prototype \&FCD, '$'); +ok(prototype \&FCC, '$'); + +ok(prototype \&check, '$$'); +ok(prototype \&checkNFD, '$'); +ok(prototype \&checkNFC, '$'); +ok(prototype \&checkNFKD,'$'); +ok(prototype \&checkNFKC,'$'); +ok(prototype \&checkFCD, '$'); +ok(prototype \&checkFCC, '$'); + +ok(prototype \&decompose, '$;$'); +ok(prototype \&reorder, '$'); +ok(prototype \&compose, '$'); +ok(prototype \&composeContiguous, '$'); + +ok(prototype \&getCanon, '$'); +ok(prototype \&getCompat, '$'); +ok(prototype \&getComposite, '$$'); +ok(prototype \&getCombinClass,'$'); +ok(prototype \&isExclusion, '$'); +ok(prototype \&isSingleton, '$'); +ok(prototype \&isNonStDecomp, '$'); +ok(prototype \&isComp2nd, '$'); +ok(prototype \&isComp_Ex, '$'); + +ok(prototype \&isNFD_NO, '$'); +ok(prototype \&isNFC_NO, '$'); +ok(prototype \&isNFC_MAYBE, '$'); +ok(prototype \&isNFKD_NO, '$'); +ok(prototype \&isNFKC_NO, '$'); +ok(prototype \&isNFKC_MAYBE, '$'); + diff --git a/ext/Unicode/Normalize/t/split.t b/ext/Unicode/Normalize/t/split.t new file mode 100644 index 0000000..03b599e --- /dev/null +++ b/ext/Unicode/Normalize/t/split.t @@ -0,0 +1,73 @@ + +BEGIN { + unless ("A" eq pack('U', 0x41)) { + print "1..0 # Unicode::Normalize " . + "cannot stringify a Unicode code point\n"; + exit 0; + } +} + +BEGIN { + if ($ENV{PERL_CORE}) { + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); + } +} + +######################### + +use Test; +use strict; +use warnings; +BEGIN { plan tests => 14 }; +use Unicode::Normalize qw(:all); +ok(1); # If we made it this far, we're ok. + +sub _pack_U { Unicode::Normalize::pack_U(@_) } +sub _unpack_U { Unicode::Normalize::unpack_U(@_) } + +######################### + +our $proc; # before the last starter +our $unproc; # the last starter and after +# If string has no starter, entire string is set to $unproc. + +# When you have $normalized string and $unnormalized string following, +# a simple concatenation +# C<$concat = $normalized . normalize($form, $unnormalized)> +# is wrong. Instead of it, like this: +# +# ($processed, $unprocessed) = splitOnLastStarter($normalized); +# $concat = $processed . normalize($form, $unprocessed.$unnormalized); + +($proc, $unproc) = splitOnLastStarter(""); +ok($proc, ""); +ok($unproc, ""); + +($proc, $unproc) = splitOnLastStarter("A"); +ok($proc, ""); +ok($unproc, "A"); + +($proc, $unproc) = splitOnLastStarter(_pack_U(0x41, 0x300, 0x327, 0x42)); +ok($proc, _pack_U(0x41, 0x300, 0x327)); +ok($unproc, "B"); + +($proc, $unproc) = splitOnLastStarter(_pack_U(0x4E00, 0x41, 0x301)); +ok($proc, _pack_U(0x4E00)); +ok($unproc, _pack_U(0x41, 0x301)); + +($proc, $unproc) = splitOnLastStarter(_pack_U(0x302, 0x301, 0x300)); +ok($proc, ""); +ok($unproc, _pack_U(0x302, 0x301, 0x300)); + +our $ka_grave = _pack_U(0x41, 0, 0x42, 0x304B, 0x300); +our $dakuten = _pack_U(0x3099); +our $ga_grave = _pack_U(0x41, 0, 0x42, 0x304C, 0x300); + +our ($p, $u) = splitOnLastStarter($ka_grave); +our $concat = $p . NFC($u.$dakuten); + +ok(NFC($ka_grave.$dakuten) eq $ga_grave); +ok(NFC($ka_grave).NFC($dakuten) ne $ga_grave); +ok($concat eq $ga_grave); + -- 2.7.4