From 2a204b451f774baa0cfb5a7a330513820530f0bf Mon Sep 17 00:00:00 2001 From: Jarkko Hietaniemi Date: Fri, 30 Nov 2001 13:41:15 +0000 Subject: [PATCH] Upgrade to Unicode::Normalize 0.12. p4raw-id: //depot/perl@13382 --- ext/Unicode/Normalize/Changes | 9 +++ ext/Unicode/Normalize/Normalize.pm | 148 ++++++++++++++++++++++++++++++++++--- ext/Unicode/Normalize/Normalize.xs | 62 +++++++++------- ext/Unicode/Normalize/README | 6 +- ext/Unicode/Normalize/mkheader | 23 +++++- ext/Unicode/Normalize/t/func.t | 26 ++++--- ext/Unicode/Normalize/t/norm.t | 2 +- ext/Unicode/Normalize/t/test.t | 5 +- 8 files changed, 226 insertions(+), 55 deletions(-) diff --git a/ext/Unicode/Normalize/Changes b/ext/Unicode/Normalize/Changes index bf17449..4426648 100644 --- a/ext/Unicode/Normalize/Changes +++ b/ext/Unicode/Normalize/Changes @@ -1,5 +1,14 @@ Revision history for Perl extension Unicode::Normalize. +0.12 Wed Nov 29 22:49:02 2001 + - documentation in .pod is appended to .pm and the .pod is removed. + (only POD in NON-XS refers to Lingua::KO::Hangul::Util.) + +0.11 Sat Nov 24 10:18:38 2001 + - documentation of some functions for character data. + - Change 12909: by Jarkko Hietaniemi. + - Change 13228: by Peter Prymmer. + 0.10 Sat Nov 03 16:30:20 2001 - The XS version is now independent of Lingua::KO::Hangul::Util. (though the Non-XS version still requires that.) 
diff --git a/ext/Unicode/Normalize/Normalize.pm b/ext/Unicode/Normalize/Normalize.pm index a583425..f416c58 100644 --- a/ext/Unicode/Normalize/Normalize.pm +++ b/ext/Unicode/Normalize/Normalize.pm @@ -5,7 +5,7 @@ use strict; use warnings; use Carp; -our $VERSION = '0.10'; +our $VERSION = '0.12'; our $PACKAGE = __PACKAGE__; require Exporter; @@ -14,8 +14,10 @@ require AutoLoader; our @ISA = qw(Exporter DynaLoader); our @EXPORT = qw( NFC NFD NFKC NFKD ); -our @EXPORT_OK = qw( normalize decompose reorder compose - getCanon getCompat getComposite getCombinClass getExclusion); +our @EXPORT_OK = qw( + normalize decompose reorder compose + getCanon getCompat getComposite getCombinClass isExclusion +); our %EXPORT_TAGS = ( all => [ @EXPORT, @EXPORT_OK ] ); bootstrap Unicode::Normalize $VERSION; @@ -23,23 +25,147 @@ bootstrap Unicode::Normalize $VERSION; use constant CANON => 0; use constant COMPAT => 1; -sub NFD ($) { reorder(decompose($_[0], CANON)) } - +sub NFD ($) { reorder(decompose($_[0], CANON )) } sub NFKD ($) { reorder(decompose($_[0], COMPAT)) } -sub NFC ($) { compose(reorder(decompose($_[0], CANON))) } - +sub NFC ($) { compose(reorder(decompose($_[0], CANON ))) } sub NFKC ($) { compose(reorder(decompose($_[0], COMPAT))) } sub normalize($$) { my $form = shift; - $form eq 'D' || $form eq 'NFD' ? NFD ($_[0]) : - $form eq 'C' || $form eq 'NFC' ? NFC ($_[0]) : - $form eq 'KD' || $form eq 'NFKD' ? NFKD($_[0]) : - $form eq 'KC' || $form eq 'NFKC' ? NFKC($_[0]) : + $form =~ s/NF//; + $form eq 'D' ? NFD ($_[0]) : + $form eq 'C' ? NFC ($_[0]) : + $form eq 'KD' ? NFKD($_[0]) : + $form eq 'KC' ? 
NFKC($_[0]) : croak $PACKAGE."::normalize: invalid form name: $form"; } 1; __END__ + +=head1 NAME + +Unicode::Normalize - normalized forms of Unicode text + +=head1 SYNOPSIS + + use Unicode::Normalize; + + $string_NFD = NFD($raw_string); # Normalization Form D + $string_NFC = NFC($raw_string); # Normalization Form C + $string_NFKD = NFKD($raw_string); # Normalization Form KD + $string_NFKC = NFKC($raw_string); # Normalization Form KC + + or + + use Unicode::Normalize 'normalize'; + + $string_NFD = normalize('D', $raw_string); # Normalization Form D + $string_NFC = normalize('C', $raw_string); # Normalization Form C + $string_NFKD = normalize('KD', $raw_string); # Normalization Form KD + $string_NFKC = normalize('KC', $raw_string); # Normalization Form KC + +=head1 DESCRIPTION + +=head2 Normalization + +=over 4 + +=item C<$string_NFD = NFD($raw_string)> + +returns the Normalization Form D (formed by canonical decomposition). + + +=item C<$string_NFC = NFC($raw_string)> + +returns the Normalization Form C (formed by canonical decomposition +followed by canonical composition). + +=item C<$string_NFKD = NFKD($raw_string)> + +returns the Normalization Form KD (formed by compatibility decomposition). + +=item C<$string_NFKC = NFKC($raw_string)> + +returns the Normalization Form KC (formed by compatibility decomposition +followed by B<canonical> composition). + +=item C<$normalized_string = normalize($form_name, $raw_string)> + +As C<$form_name>, one of the following names must be given. + + 'C' or 'NFC' for Normalization Form C + 'D' or 'NFD' for Normalization Form D + 'KC' or 'NFKC' for Normalization Form KC + 'KD' or 'NFKD' for Normalization Form KD + +=back + +=head2 Character Data + +These functions are interface of character data used internally. +If you want only to get unicode normalization forms, +you need not to call them by yourself.
+ +=over 4 + +=item C<$canonical_decomposed = getCanon($codepoint)> + +=item C<$compatibility_decomposed = getCompat($codepoint)> + +If the character of the specified codepoint is canonically or +compatibility decomposable (including Hangul Syllables), +returns the B<completely decomposed> string equivalent to it. + +If it is not decomposable, returns undef. + +=item C<$uv_composite = getComposite($uv_here, $uv_next)> + +If the couple of two characters here and next (as codepoints) is composable +(including Hangul Jamo/Syllables and Exclusions), +returns the codepoint of the composite. + +If they are not composable, returns undef. + +=item C<$combining_class = getCombinClass($codepoint)> + +Returns the combining class as integer of the character. + +=item C<$is_exclusion = isExclusion($codepoint)> + +Returns a boolean whether the character of the specified codepoint is +a composition exclusion. + +=back + +=head2 EXPORT + +C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default. + +C<normalize> and other some functions: on request. + +=head1 AUTHOR + +SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt> + + http://homepage1.nifty.com/nomenclator/perl/ + + Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved. + + This program is free software; you can redistribute it and/or + modify it under the same terms as Perl itself. + +=head1 SEE ALSO + +=over 4 + +=item http://www.unicode.org/unicode/reports/tr15/ + +Unicode Normalization Forms - UAX #15 + +=back + +=cut + diff --git a/ext/Unicode/Normalize/Normalize.xs b/ext/Unicode/Normalize/Normalize.xs index aca0853..4d5b0b8 100644 --- a/ext/Unicode/Normalize/Normalize.xs +++ b/ext/Unicode/Normalize/Normalize.xs @@ -44,7 +44,7 @@ #define Hangul_TCount 28 #define Hangul_IsS(u) ((Hangul_SBase <= (u)) && ((u) <= Hangul_SFinal)) -#define Hangul_IsN(u) (!
(((u) - Hangul_SBase) % Hangul_TCount)) +#define Hangul_IsN(u) (((u) - Hangul_SBase) % Hangul_TCount == 0) #define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u)) #define Hangul_IsL(u) ((Hangul_LBase <= (u)) && ((u) <= Hangul_LFinal)) #define Hangul_IsV(u) ((Hangul_VBase <= (u)) && ((u) <= Hangul_VFinal)) @@ -86,7 +86,7 @@ U8* dec_compat (UV uv) return row ? row[uv & 0xff] : NULL; } -UV getComposite (UV uv, UV uv2) +UV composite_uv (UV uv, UV uv2) { UNF_complist ***plane, **row, *cell, *i; @@ -99,14 +99,14 @@ UV getComposite (UV uv, UV uv2) } if(Hangul_IsLV(uv) && Hangul_IsT(uv2)) { uv2 -= Hangul_TBase; /* tindex */ - return (uv + uv2); + return(uv + uv2); } plane = UNF_compos[uv >> 16]; if(! plane) return 0; row = plane[(uv >> 8) & 0xff]; - if(! row) return 0; + if(! row) return 0; cell = row[uv & 0xff]; - if(! cell) return 0; + if(! cell) return 0; for(i = cell; i->nextchar; i++) { if(uv2 == i->nextchar) return i->composite; } @@ -126,7 +126,7 @@ U8 getCombinClass (UV uv) void sv_cat_decompHangul (SV* sv, UV uv) { UV sindex, lindex, vindex, tindex; - U8 *t, temp[3 * UTF8_MAXLEN + 1]; + U8 *t, tmp[3 * UTF8_MAXLEN + 1]; if(! 
Hangul_IsS(uv)) return; @@ -135,27 +135,26 @@ void sv_cat_decompHangul (SV* sv, UV uv) vindex = (sindex % Hangul_NCount) / Hangul_TCount; tindex = sindex % Hangul_TCount; - t = temp; + t = tmp; t = uvuni_to_utf8(t, (lindex + Hangul_LBase)); t = uvuni_to_utf8(t, (vindex + Hangul_VBase)); if (tindex) t = uvuni_to_utf8(t, (tindex + Hangul_TBase)); *t = '\0'; - sv_catpvn(sv, (char *)temp, strlen((char *)temp)); + sv_catpvn(sv, (char *)tmp, strlen((char *)tmp)); } MODULE = Unicode::Normalize PACKAGE = Unicode::Normalize - SV* decompose(arg, compat) SV * arg SV * compat PROTOTYPE: $ PREINIT: + UV uv; SV *src, *dst; STRLEN srclen, dstlen, retlen; U8 *s, *e, *p, *d, *r; - UV uv; bool iscompat; CODE: if(SvUTF8(arg)) { @@ -164,7 +163,6 @@ decompose(arg, compat) src = sv_mortalcopy(arg); sv_utf8_upgrade(src); } - iscompat = SvTRUE(compat); dst = newSV(1); @@ -208,16 +206,18 @@ reorder(arg) s = (U8*)SvPV(src,srclen); e = s + srclen; + for(p = s; p < e;){ U8 *cc_in; STRLEN cc_len, cc_iter, cc_pos; uv = utf8n_to_uvchr(p, e - p, &retlen, 0); - p += retlen; - cc_pos = 0; curCC = getCombinClass(uv); + p += retlen; + if(! 
(curCC && p < e)) continue; else cc_in = p - retlen; + cc_pos = 0; stk_cc[cc_pos].cc = curCC; stk_cc[cc_pos].uv = uv; stk_cc[cc_pos].pos = cc_pos; @@ -255,7 +255,7 @@ reorder(arg) -void +SV* compose(arg) SV * arg PROTOTYPE: $ @@ -263,19 +263,20 @@ compose(arg) SV *src, *dst, *tmp; U8 *s, *p, *e, *d, *t, *tmp_start, curCC, preCC; UV uv, uvS, uvComp; - STRLEN srclen, dstlen, tmplen, dstcur, retlen; + STRLEN srclen, dstlen, tmplen, retlen; bool beginning = TRUE; - PPCODE: + CODE: if(SvUTF8(arg)) { src = arg; } else { src = sv_mortalcopy(arg); sv_utf8_upgrade(src); } + s = (U8*)SvPV(src, srclen); e = s + srclen; dstlen = srclen + 1; /* equal or shorter, XXX */ - dst = sv_2mortal(newSV(dstlen)); + dst = newSV(dstlen); (void)SvPOK_only(dst); SvUTF8_on(dst); d = (U8*)SvPVX(dst); @@ -311,10 +312,10 @@ compose(arg) preCC = curCC; t = uvuni_to_utf8(t, uv); } else { - uvComp = getComposite(uvS, uv); + uvComp = composite_uv(uvS, uv); /* S + C + S => S-S + C would be also blocked. */ - if( uvComp && ! getExclusion(uvComp) && preCC <= curCC) + if( uvComp && ! isExclusion(uvComp) && preCC <= curCC) { /* preCC not changed to curCC */ uvS = uvComp; @@ -326,16 +327,19 @@ compose(arg) } } } - d = uvuni_to_utf8(d, uvS); /* composed char */ + d = uvuni_to_utf8(d, uvS); /* starter (composed or not) */ if(tmplen = t - tmp_start) { /* uncomposed combining char */ t = (U8*)SvPVX(tmp); while(tmplen--) *d++ = *t++; } uvS = uv; } /* for */ - dstcur = d - (U8*)SvPVX(dst); - SvCUR_set(dst, dstcur); - XPUSHs(dst); + e = d; /* end of dst */ + d = (U8*)SvPVX(dst); + SvCUR_set(dst, e - d); + RETVAL = dst; + OUTPUT: + RETVAL @@ -344,13 +348,21 @@ getCombinClass(uv) UV uv bool -getExclusion(uv) +isExclusion(uv) UV uv -UV +SV* getComposite(uv, uv2) UV uv UV uv2 + PROTOTYPE: $$ + PREINIT: + UV comp; + CODE: + comp = composite_uv(uv, uv2); + RETVAL = comp ? 
newSVuv(comp) : &PL_sv_undef; + OUTPUT: + RETVAL SV* getCanon(uv) diff --git a/ext/Unicode/Normalize/README b/ext/Unicode/Normalize/README index 3f0c424..db54b1f 100644 --- a/ext/Unicode/Normalize/README +++ b/ext/Unicode/Normalize/README @@ -1,4 +1,4 @@ -Unicode/Normalize version 0.10 +Unicode/Normalize version 0.12 =================================== Unicode::Normalize - normalized forms of Unicode text @@ -48,11 +48,13 @@ Carp Exporter File::Copy File::Spec -Lingua::KO::Hangul::Util 0.06 unicore/CombiningClass.pl or unicode/CombiningClass.pl unicore/Decomposition.pl or unicode/Decomposition.pl unicore/CompExcl.txt or unicode/CompExcl.txt +and for the Non-XS version, in addition to the above, +Lingua::KO::Hangul::Util 0.06 + COPYRIGHT AND LICENCE SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt> diff --git a/ext/Unicode/Normalize/mkheader b/ext/Unicode/Normalize/mkheader index ac6cac1..4283810 100644 --- a/ext/Unicode/Normalize/mkheader +++ b/ext/Unicode/Normalize/mkheader @@ -129,12 +129,26 @@ sub utf8len { croak "$PACKAGE: illegal char in the composite. utf-8 max is 0x10ffff."; } +sub utfebcdiclen { + my $uv = shift; + return $uv < 0xA0 ? 1 : + $uv < 0x400 ? 2 : + $uv < 0x4000 ? 3 : + $uv < 0x40000 ? 4 : + $uv < 0x110000 ? 5 : + croak "$PACKAGE: illegal char in the composite. utf-8 max is 0x10ffff."; +} + my $prefix = "UNF_"; my $structname = "${prefix}complist"; our (%Comp1st, %CompList); +my $errExpand = "$PACKAGE: A composable pair in %s " + . "is longer than the composite in bytes!\n" + . "%d + %d => %d\nQuit. Please inform the author..."; + foreach(sort keys %Compos) { my @a = unpack('U*', $_); my $val = $Compos{$_}; @@ -143,9 +157,10 @@ foreach(sort keys %Compos) { $CompList{ $name }{ $a[1] } = $val; if( utf8len($a[0]) + utf8len($a[1]) < utf8len($val) ) { - croak "$PACKAGE: " - . "composable pair is longer than the composite in bytes!\n" - .
sprintf("%d + %d => %d", $a[0], $a[1], $val); + croak sprintf($errExpand, "utf-8", $a[0], $a[1], $val); + } + if( utfebcdiclen($a[0]) + utfebcdiclen($a[1]) < utfebcdiclen($val)) { + croak sprintf($errExpand, "utf-ebcdic", $a[0], $a[1], $val); } } @@ -168,7 +183,7 @@ my $file = "unfexc.h"; open FH, ">$file" or croak "$PACKAGE: $file can't be made"; binmode FH; select FH; -print "bool getExclusion (UV uv) \n{\nreturn\n\t"; +print "bool isExclusion (UV uv) \n{\nreturn\n\t"; while(@Exclus) { my $cur = shift @Exclus; diff --git a/ext/Unicode/Normalize/t/func.t b/ext/Unicode/Normalize/t/func.t index 8907634..fbbcb28 100644 --- a/ext/Unicode/Normalize/t/func.t +++ b/ext/Unicode/Normalize/t/func.t @@ -24,6 +24,7 @@ print ! defined getCanon( 0) && getCanon(0x00EF) eq pack('U*', 0x0069, 0x0308) && getCanon(0x304C) eq pack('U*', 0x304B, 0x3099) && getCanon(0x1EA4) eq pack('U*', 0x0041, 0x0302, 0x0301) + && getCanon(0x1F82) eq "\x{03B1}\x{0313}\x{0300}\x{0345}" && getCanon(0x1FAF) eq pack('U*', 0x03A9, 0x0314, 0x0342, 0x0345) && getCanon(0xAC00) eq pack('U*', 0x1100, 0x1161) && getCanon(0xAE00) eq pack('U*', 0x1100, 0x1173, 0x11AF) @@ -38,6 +39,7 @@ print ! defined getCompat( 0) && getCompat(0x00EF) eq pack('U*', 0x0069, 0x0308) && getCompat(0x304C) eq pack('U*', 0x304B, 0x3099) && getCompat(0x1EA4) eq pack('U*', 0x0041, 0x0302, 0x0301) + && getCompat(0x1F82) eq pack('U*', 0x03B1, 0x0313, 0x0300, 0x0345) && getCompat(0x1FAF) eq pack('U*', 0x03A9, 0x0314, 0x0342, 0x0345) && getCompat(0x212C) eq pack('U*', 0x0042) && getCompat(0x3243) eq pack('U*', 0x0028, 0x81F3, 0x0029) @@ -46,12 +48,13 @@ print ! defined getCompat( 0) && getCompat(0xFA2D) eq pack('U*', 0x9DB4) ? "ok" : "not ok", " 4\n"; -print ! getComposite( 0, 0) - && ! getComposite( 0, 41) - && ! getComposite(41, 0) - && ! getComposite(41, 41) - && ! getComposite(12, 0x0300) - && ! getComposite(0x0055, 0xFF00) +print ! defined getComposite( 0, 0) + && ! defined getComposite( 0, 41) + && ! defined getComposite(41, 0) + && ! 
defined getComposite(41, 41) + && ! defined getComposite(12, 0x0300) + && ! defined getComposite(0x0055, 0xFF00) + && 0x00C0 == getComposite(0x0041, 0x0300) && 0x00D9 == getComposite(0x0055, 0x0300) && 0x1E14 == getComposite(0x0112, 0x0300) && 0xAC00 == getComposite(0x1100, 0x1161) @@ -61,9 +64,10 @@ print ! getComposite( 0, 0) && 0xAE00 == getComposite(0xADF8, 0x11AF) ? "ok" : "not ok", " 5\n"; -print ! getExclusion( 0) - && ! getExclusion(41) - && getExclusion(2392) - && getExclusion(3907) - && getExclusion(64334) +print ! isExclusion( 0) + && ! isExclusion(41) + && isExclusion(2392) + && isExclusion(3907) + && isExclusion(64334) ? "ok" : "not ok", " 6\n"; + diff --git a/ext/Unicode/Normalize/t/norm.t b/ext/Unicode/Normalize/t/norm.t index 1de2e7f..970e671 100644 --- a/ext/Unicode/Normalize/t/norm.t +++ b/ext/Unicode/Normalize/t/norm.t @@ -21,7 +21,7 @@ sub hexNFC { } sub hexNFD { join " ", map sprintf("%04X", $_), - unpack 'U*', normalize 'D', pack 'U*', map hex(), split ' ', shift; + unpack 'U*', normalize 'NFD', pack 'U*', map hex(), split ' ', shift; } ok(hexNFC("0061 0315 0300 05AE 05C4 0062"), "00E0 05AE 05C4 0315 0062"); diff --git a/ext/Unicode/Normalize/t/test.t b/ext/Unicode/Normalize/t/test.t index 5544a3b..6c3e7ac 100644 --- a/ext/Unicode/Normalize/t/test.t +++ b/ext/Unicode/Normalize/t/test.t @@ -6,7 +6,7 @@ use Test; use strict; use warnings; -BEGIN { plan tests => 18 }; +BEGIN { plan tests => 20 }; use Unicode::Normalize; ok(1); # If we made it this far, we're ok. @@ -41,3 +41,6 @@ ok(hexNFD("0061 05AE 05C4 0300 0315 0062"), "0061 05AE 05C4 0300 0315 0062"); ok(hexNFC("0000 0041 0000 0000"), "0000 0041 0000 0000"); ok(hexNFD("0000 0041 0000 0000"), "0000 0041 0000 0000"); +# should be unary. +ok(NFC "\x{41}\x{0302}\x{0301}\x62" eq "\x{1EA4}\x62"); +ok(NFD "\x{E0}\x{AC00}" eq "\x{61}\x{0300}\x{1100}\x{1161}"); -- 2.7.4