From 983ffd37e39751798fdd14853511af238c5fe291 Mon Sep 17 00:00:00 2001 From: Jarkko Hietaniemi Date: Sun, 21 Oct 2001 16:12:08 +0000 Subject: [PATCH] Implement multicharacter case mappings where a single Unicode character can be mapped into several. p4raw-id: //depot/perl@12546 --- MANIFEST | 3 -- embed.h | 2 + embed.pl | 2 + global.sym | 1 + lib/unicore/To/Lower.pl | 106 ++++++++++++++++++++++++++++++++++++ lib/unicore/To/SpecLower.pl | 107 ------------------------------------ lib/unicore/To/SpecTitle.pl | 106 ------------------------------------ lib/unicore/To/SpecUpper.pl | 106 ------------------------------------ lib/unicore/To/Title.pl | 106 ++++++++++++++++++++++++++++++++++++ lib/unicore/To/Upper.pl | 106 ++++++++++++++++++++++++++++++++++++ lib/unicore/mktables | 30 ++++++++--- pod/perlfunc.pod | 13 +++-- pod/perlunicode.pod | 40 ++++++++++---- proto.h | 1 + t/op/lc.t | 128 ++++++++++++++++++++++++++++---------------- utf8.c | 64 ++++++++++++++-------- 16 files changed, 508 insertions(+), 413 deletions(-) delete mode 100644 lib/unicore/To/SpecLower.pl delete mode 100644 lib/unicore/To/SpecTitle.pl delete mode 100644 lib/unicore/To/SpecUpper.pl diff --git a/MANIFEST b/MANIFEST index 486a2b3..d21175d 100644 --- a/MANIFEST +++ b/MANIFEST @@ -1604,9 +1604,6 @@ lib/unicore/Scripts.txt Unicode character database lib/unicore/SpecCase.txt Unicode character database lib/unicore/To/Digit.pl Unicode character database lib/unicore/To/Lower.pl Unicode character database -lib/unicore/To/SpecLower.pl Unicode character database -lib/unicore/To/SpecTitle.pl Unicode character database -lib/unicore/To/SpecUpper.pl Unicode character database lib/unicore/To/Title.pl Unicode character database lib/unicore/To/Upper.pl Unicode character database lib/unicore/UCD.html Unicode character database diff --git a/embed.h b/embed.h index b591206..8c584a5 100644 --- a/embed.h +++ b/embed.h @@ -720,6 +720,7 @@ #define swash_fetch Perl_swash_fetch #define taint_env Perl_taint_env #define taint_proper Perl_taint_proper +#define to_utf8_case Perl_to_utf8_case #define to_utf8_lower Perl_to_utf8_lower #define to_utf8_upper Perl_to_utf8_upper #define to_utf8_title Perl_to_utf8_title @@ -2230,6 +2231,7 @@ #define swash_fetch(a,b,c) Perl_swash_fetch(aTHX_ a,b,c) #define taint_env() Perl_taint_env(aTHX) #define taint_proper(a,b) Perl_taint_proper(aTHX_ a,b) +#define to_utf8_case(a,b,c,d,e,f) Perl_to_utf8_case(aTHX_ a,b,c,d,e,f) #define to_utf8_lower(a,b,c) Perl_to_utf8_lower(aTHX_ a,b,c) #define to_utf8_upper(a,b,c) Perl_to_utf8_upper(aTHX_ a,b,c) #define to_utf8_title(a,b,c) Perl_to_utf8_title(aTHX_ a,b,c) diff --git a/embed.pl b/embed.pl index 8c3ba3c..6c20660 100755 --- a/embed.pl +++ b/embed.pl @@ -1818,6 +1818,8 @@ Ap |SV* |swash_init |char* pkg|char* name|SV* listsv \ Ap |UV |swash_fetch |SV *sv|U8 *ptr|bool do_utf8 Ap |void |taint_env Ap |void |taint_proper |const char* f|const char* s +Ap |UV |to_utf8_case |U8 *p|U8* ustrp|STRLEN *lenp \ + |SV **swash|char *normal|char *special Ap |UV |to_utf8_lower |U8 *p|U8* ustrp|STRLEN *lenp Ap |UV |to_utf8_upper |U8 *p|U8* ustrp|STRLEN *lenp Ap |UV |to_utf8_title |U8 *p|U8* ustrp|STRLEN *lenp diff --git a/global.sym b/global.sym index c5a9246..ede1f3d 100644 --- a/global.sym +++ b/global.sym @@ -470,6 +470,7 @@ Perl_swash_init Perl_swash_fetch Perl_taint_env Perl_taint_proper +Perl_to_utf8_case Perl_to_utf8_lower Perl_to_utf8_upper Perl_to_utf8_title diff --git a/lib/unicore/To/Lower.pl b/lib/unicore/To/Lower.pl index 0fd4f8d..ce89c8e 100644 --- a/lib/unicore/To/Lower.pl +++ b/lib/unicore/To/Lower.pl @@ -1,6 +1,112 @@ # !!!!!!! DO NOT EDIT THIS FILE !!!!!!! # This file is built by mktables from e.g. Unicode.txt. # Any changes made here will be lost! + +%utf8::ToSpecLower = ( +'223' => "\x{00DF}", +'329' => "\x{0149}", +'496' => "\x{01F0}", +'912' => "\x{0390}", +'944' => "\x{03B0}", +'1415' => "\x{0587}", +'7830' => "\x{1E96}", +'7831' => "\x{1E97}", +'7832' => "\x{1E98}", +'7833' => "\x{1E99}", +'7834' => "\x{1E9A}", +'8016' => "\x{1F50}", +'8018' => "\x{1F52}", +'8020' => "\x{1F54}", +'8022' => "\x{1F56}", +'8064' => "\x{1F80}", +'8065' => "\x{1F81}", +'8066' => "\x{1F82}", +'8067' => "\x{1F83}", +'8068' => "\x{1F84}", +'8069' => "\x{1F85}", +'8070' => "\x{1F86}", +'8071' => "\x{1F87}", +'8072' => "\x{1F80}", +'8073' => "\x{1F81}", +'8074' => "\x{1F82}", +'8075' => "\x{1F83}", +'8076' => "\x{1F84}", +'8077' => "\x{1F85}", +'8078' => "\x{1F86}", +'8079' => "\x{1F87}", +'8080' => "\x{1F90}", +'8081' => "\x{1F91}", +'8082' => "\x{1F92}", +'8083' => "\x{1F93}", +'8084' => "\x{1F94}", +'8085' => "\x{1F95}", +'8086' => "\x{1F96}", +'8087' => "\x{1F97}", +'8088' => "\x{1F90}", +'8089' => "\x{1F91}", +'8090' => "\x{1F92}", +'8091' => "\x{1F93}", +'8092' => "\x{1F94}", +'8093' => "\x{1F95}", +'8094' => "\x{1F96}", +'8095' => "\x{1F97}", +'8096' => "\x{1FA0}", +'8097' => "\x{1FA1}", +'8098' => "\x{1FA2}", +'8099' => "\x{1FA3}", +'8100' => "\x{1FA4}", +'8101' => "\x{1FA5}", +'8102' => "\x{1FA6}", +'8103' => "\x{1FA7}", +'8104' => "\x{1FA0}", +'8105' => "\x{1FA1}", +'8106' => "\x{1FA2}", +'8107' => "\x{1FA3}", +'8108' => "\x{1FA4}", +'8109' => "\x{1FA5}", +'8110' => "\x{1FA6}", +'8111' => "\x{1FA7}", +'8114' => "\x{1FB2}", +'8115' => "\x{1FB3}", +'8116' => "\x{1FB4}", +'8118' => "\x{1FB6}", +'8119' => "\x{1FB7}", +'8124' => "\x{1FB3}", +'8130' => "\x{1FC2}", +'8131' => "\x{1FC3}", +'8132' => "\x{1FC4}", +'8134' => "\x{1FC6}", +'8135' => "\x{1FC7}", +'8140' => "\x{1FC3}", +'8146' => "\x{1FD2}", +'8147' => "\x{1FD3}", +'8150' => "\x{1FD6}", +'8151' => "\x{1FD7}", +'8162' => "\x{1FE2}", +'8163' => "\x{1FE3}", +'8164' => "\x{1FE4}", +'8166' => "\x{1FE6}", +'8167' => "\x{1FE7}", +'8178' => "\x{1FF2}", +'8179' => "\x{1FF3}", +'8180' => "\x{1FF4}", +'8182' => "\x{1FF6}", +'8183' => "\x{1FF7}", +'8188' => "\x{1FF3}", +'64256' => "\x{FB00}", +'64257' => "\x{FB01}", +'64258' => "\x{FB02}", +'64259' => "\x{FB03}", +'64260' => "\x{FB04}", +'64261' => "\x{FB05}", +'64262' => "\x{FB06}", +'64275' => "\x{FB13}", +'64276' => "\x{FB14}", +'64277' => "\x{FB15}", +'64278' => "\x{FB16}", +'64279' => "\x{FB17}", +); + return <<'END'; 0041 0061 0042 0062 diff --git a/lib/unicore/To/SpecLower.pl b/lib/unicore/To/SpecLower.pl deleted file mode 100644 index 18c073b..0000000 --- a/lib/unicore/To/SpecLower.pl +++ /dev/null @@ -1,107 +0,0 @@ -# !!!!!!! DO NOT EDIT THIS FILE !!!!!!! -# This file is built by mktables from e.g. Unicode.txt. -# Any changes made here will be lost! -return <<'END'; -00DF 00DF -0149 0149 -01F0 01F0 -0390 0390 -03B0 03B0 -0587 0587 -1E96 1E96 -1E97 1E97 -1E98 1E98 -1E99 1E99 -1E9A 1E9A -1F50 1F50 -1F52 1F52 -1F54 1F54 -1F56 1F56 -1F80 1F80 -1F81 1F81 -1F82 1F82 -1F83 1F83 -1F84 1F84 -1F85 1F85 -1F86 1F86 -1F87 1F87 -1F88 1F80 -1F89 1F81 -1F8A 1F82 -1F8B 1F83 -1F8C 1F84 -1F8D 1F85 -1F8E 1F86 -1F8F 1F87 -1F90 1F90 -1F91 1F91 -1F92 1F92 -1F93 1F93 -1F94 1F94 -1F95 1F95 -1F96 1F96 -1F97 1F97 -1F98 1F90 -1F99 1F91 -1F9A 1F92 -1F9B 1F93 -1F9C 1F94 -1F9D 1F95 -1F9E 1F96 -1F9F 1F97 -1FA0 1FA0 -1FA1 1FA1 -1FA2 1FA2 -1FA3 1FA3 -1FA4 1FA4 -1FA5 1FA5 -1FA6 1FA6 -1FA7 1FA7 -1FA8 1FA0 -1FA9 1FA1 -1FAA 1FA2 -1FAB 1FA3 -1FAC 1FA4 -1FAD 1FA5 -1FAE 1FA6 -1FAF 1FA7 -1FB2 1FB2 -1FB3 1FB3 -1FB4 1FB4 -1FB6 1FB6 -1FB7 1FB7 -1FBC 1FB3 -1FC2 1FC2 -1FC3 1FC3 -1FC4 1FC4 -1FC6 1FC6 -1FC7 1FC7 -1FCC 1FC3 -1FD2 1FD2 -1FD3 1FD3 -1FD6 1FD6 -1FD7 1FD7 -1FE2 1FE2 -1FE3 1FE3 -1FE4 1FE4 -1FE6 1FE6 -1FE7 1FE7 -1FF2 1FF2 -1FF3 1FF3 -1FF4 1FF4 -1FF6 1FF6 -1FF7 1FF7 -1FFC 1FF3 -FB00 FB00 -FB01 FB01 -FB02 FB02 -FB03 FB03 -FB04 FB04 -FB05 FB05 -FB06 FB06 -FB13 FB13 -FB14 FB14 -FB15 FB15 -FB16 FB16 -FB17 FB17 -END diff --git a/lib/unicore/To/SpecTitle.pl b/lib/unicore/To/SpecTitle.pl deleted file mode 100644 index c3e1911..0000000 --- a/lib/unicore/To/SpecTitle.pl +++ /dev/null @@ -1,106 +0,0 @@ -# !!!!!!! DO NOT EDIT THIS FILE !!!!!!! -# This file is built by mktables from e.g. Unicode.txt. -# Any changes made here will be lost! -return <<'END'; -00DF 0053 0073 -0149 02BC 004E -01F0 004A 030C -0390 0399 0308 0301 -03B0 03A5 0308 0301 -0587 0535 0582 -1E96 0048 0331 -1E97 0054 0308 -1E98 0057 030A -1E99 0059 030A -1E9A 0041 02BE -1F50 03A5 0313 -1F52 03A5 0313 0300 -1F54 03A5 0313 0301 -1F56 03A5 0313 0342 -1F80 1F88 -1F81 1F89 -1F82 1F8A -1F83 1F8B -1F84 1F8C -1F85 1F8D -1F86 1F8E -1F87 1F8F -1F88 1F88 -1F89 1F89 -1F8A 1F8A -1F8B 1F8B -1F8C 1F8C -1F8D 1F8D -1F8E 1F8E -1F8F 1F8F -1F90 1F98 -1F91 1F99 -1F92 1F9A -1F93 1F9B -1F94 1F9C -1F95 1F9D -1F96 1F9E -1F97 1F9F -1F98 1F98 -1F99 1F99 -1F9A 1F9A -1F9B 1F9B -1F9C 1F9C -1F9D 1F9D -1F9E 1F9E -1F9F 1F9F -1FA0 1FA8 -1FA1 1FA9 -1FA2 1FAA -1FA3 1FAB -1FA4 1FAC -1FA5 1FAD -1FA6 1FAE -1FA7 1FAF -1FA8 1FA8 -1FA9 1FA9 -1FAA 1FAA -1FAB 1FAB -1FAC 1FAC -1FAD 1FAD -1FAE 1FAE -1FAF 1FAF -1FB2 1FBA 0345 -1FB3 1FBC -1FB4 0386 0345 -1FB6 0391 0342 -1FB7 0391 0342 0345 -1FBC 1FBC -1FC2 1FCA 0345 -1FC3 1FCC -1FC4 0389 0345 -1FC6 0397 0342 -1FC7 0397 0342 0345 -1FCC 1FCC -1FD2 0399 0308 0300 -1FD3 0399 0308 0301 -1FD6 0399 0342 -1FD7 0399 0308 0342 -1FE2 03A5 0308 0300 -1FE3 03A5 0308 0301 -1FE4 03A1 0313 -1FE6 03A5 0342 -1FE7 03A5 0308 0342 -1FF2 1FFA 0345 -1FF3 1FFC -1FF4 038F 0345 -1FF6 03A9 0342 -1FF7 03A9 0342 0345 -1FFC 1FFC -FB00 0046 0066 -FB01 0046 0069 -FB02 0046 006C -FB03 0046 0066 0069 -FB04 0046 0066 006C -FB05 FB06 0053 0074 -FB13 0544 0576 -FB14 0544 0565 -FB15 0544 056B -FB16 054E 0576 -FB17 0544 056D -END diff --git a/lib/unicore/To/SpecUpper.pl b/lib/unicore/To/SpecUpper.pl deleted file mode 100644 index e5af4b1..0000000 --- a/lib/unicore/To/SpecUpper.pl +++ /dev/null @@ -1,106 +0,0 @@ -# !!!!!!! DO NOT EDIT THIS FILE !!!!!!! -# This file is built by mktables from e.g. Unicode.txt. -# Any changes made here will be lost! -return <<'END'; -00DF 0053 0053 -0149 02BC 004E -01F0 004A 030C -0390 0399 0308 0301 -03B0 03A5 0308 0301 -0587 0535 0552 -1E96 0048 0331 -1E97 0054 0308 -1E98 0057 030A -1E99 0059 030A -1E9A 0041 02BE -1F50 03A5 0313 -1F52 03A5 0313 0300 -1F54 03A5 0313 0301 -1F56 03A5 0313 0342 -1F80 1F08 0399 -1F81 1F09 0399 -1F82 1F0A 0399 -1F83 1F0B 0399 -1F84 1F0C 0399 -1F85 1F0D 0399 -1F86 1F0E 0399 -1F87 1F0F 0399 -1F88 1F08 0399 -1F89 1F09 0399 -1F8A 1F0A 0399 -1F8B 1F0B 0399 -1F8C 1F0C 0399 -1F8D 1F0D 0399 -1F8E 1F0E 0399 -1F8F 1F0F 0399 -1F90 1F28 0399 -1F91 1F29 0399 -1F92 1F2A 0399 -1F93 1F2B 0399 -1F94 1F2C 0399 -1F95 1F2D 0399 -1F96 1F2E 0399 -1F97 1F2F 0399 -1F98 1F28 0399 -1F99 1F29 0399 -1F9A 1F2A 0399 -1F9B 1F2B 0399 -1F9C 1F2C 0399 -1F9D 1F2D 0399 -1F9E 1F2E 0399 -1F9F 1F2F 0399 -1FA0 1F68 0399 -1FA1 1F69 0399 -1FA2 1F6A 0399 -1FA3 1F6B 0399 -1FA4 1F6C 0399 -1FA5 1F6D 0399 -1FA6 1F6E 0399 -1FA7 1F6F 0399 -1FA8 1F68 0399 -1FA9 1F69 0399 -1FAA 1F6A 0399 -1FAB 1F6B 0399 -1FAC 1F6C 0399 -1FAD 1F6D 0399 -1FAE 1F6E 0399 -1FAF 1F6F 0399 -1FB2 1FBA 0399 -1FB3 0391 0399 -1FB4 0386 0399 -1FB6 0391 0342 -1FB7 0391 0342 0399 -1FBC 0391 0399 -1FC2 1FCA 0399 -1FC3 0397 0399 -1FC4 0389 0399 -1FC6 0397 0342 -1FC7 0397 0342 0399 -1FCC 0397 0399 -1FD2 0399 0308 0300 -1FD3 0399 0308 0301 -1FD6 0399 0342 -1FD7 0399 0308 0342 -1FE2 03A5 0308 0300 -1FE3 03A5 0308 0301 -1FE4 03A1 0313 -1FE6 03A5 0342 -1FE7 03A5 0308 0342 -1FF2 1FFA 0399 -1FF3 03A9 0399 -1FF4 038F 0399 -1FF6 03A9 0342 -1FF7 03A9 0342 0399 -1FFC 03A9 0399 -FB00 0046 0046 -FB01 0046 0049 -FB02 0046 004C -FB03 0046 0046 0049 -FB04 0046 0046 004C -FB05 FB06 0053 0054 -FB13 0544 0546 -FB14 0544 0535 -FB15 0544 053B -FB16 054E 0546 -FB17 0544 053D -END diff --git a/lib/unicore/To/Title.pl b/lib/unicore/To/Title.pl index 2fca353..3da9ca9 100644 --- a/lib/unicore/To/Title.pl +++ b/lib/unicore/To/Title.pl @@ -1,6 +1,112 @@ # !!!!!!! DO NOT EDIT THIS FILE !!!!!!! # This file is built by mktables from e.g. Unicode.txt. # Any changes made here will be lost! + +%utf8::ToSpecTitle = ( +'223' => "\x{0053}\x{0073}", +'329' => "\x{02BC}\x{004E}", +'496' => "\x{004A}\x{030C}", +'912' => "\x{0399}\x{0308}\x{0301}", +'944' => "\x{03A5}\x{0308}\x{0301}", +'1415' => "\x{0535}\x{0582}", +'7830' => "\x{0048}\x{0331}", +'7831' => "\x{0054}\x{0308}", +'7832' => "\x{0057}\x{030A}", +'7833' => "\x{0059}\x{030A}", +'7834' => "\x{0041}\x{02BE}", +'8016' => "\x{03A5}\x{0313}", +'8018' => "\x{03A5}\x{0313}\x{0300}", +'8020' => "\x{03A5}\x{0313}\x{0301}", +'8022' => "\x{03A5}\x{0313}\x{0342}", +'8064' => "\x{1F88}", +'8065' => "\x{1F89}", +'8066' => "\x{1F8A}", +'8067' => "\x{1F8B}", +'8068' => "\x{1F8C}", +'8069' => "\x{1F8D}", +'8070' => "\x{1F8E}", +'8071' => "\x{1F8F}", +'8072' => "\x{1F88}", +'8073' => "\x{1F89}", +'8074' => "\x{1F8A}", +'8075' => "\x{1F8B}", +'8076' => "\x{1F8C}", +'8077' => "\x{1F8D}", +'8078' => "\x{1F8E}", +'8079' => "\x{1F8F}", +'8080' => "\x{1F98}", +'8081' => "\x{1F99}", +'8082' => "\x{1F9A}", +'8083' => "\x{1F9B}", +'8084' => "\x{1F9C}", +'8085' => "\x{1F9D}", +'8086' => "\x{1F9E}", +'8087' => "\x{1F9F}", +'8088' => "\x{1F98}", +'8089' => "\x{1F99}", +'8090' => "\x{1F9A}", +'8091' => "\x{1F9B}", +'8092' => "\x{1F9C}", +'8093' => "\x{1F9D}", +'8094' => "\x{1F9E}", +'8095' => "\x{1F9F}", +'8096' => "\x{1FA8}", +'8097' => "\x{1FA9}", +'8098' => "\x{1FAA}", +'8099' => "\x{1FAB}", +'8100' => "\x{1FAC}", +'8101' => "\x{1FAD}", +'8102' => "\x{1FAE}", +'8103' => "\x{1FAF}", +'8104' => "\x{1FA8}", +'8105' => "\x{1FA9}", +'8106' => "\x{1FAA}", +'8107' => "\x{1FAB}", +'8108' => "\x{1FAC}", +'8109' => "\x{1FAD}", +'8110' => "\x{1FAE}", +'8111' => "\x{1FAF}", +'8114' => "\x{1FBA}\x{0345}", +'8115' => "\x{1FBC}", +'8116' => "\x{0386}\x{0345}", +'8118' => "\x{0391}\x{0342}", +'8119' => "\x{0391}\x{0342}\x{0345}", +'8124' => "\x{1FBC}", +'8130' => "\x{1FCA}\x{0345}", +'8131' => "\x{1FCC}", +'8132' => "\x{0389}\x{0345}", +'8134' => "\x{0397}\x{0342}", +'8135' => "\x{0397}\x{0342}\x{0345}", +'8140' => "\x{1FCC}", +'8146' => "\x{0399}\x{0308}\x{0300}", +'8147' => "\x{0399}\x{0308}\x{0301}", +'8150' => "\x{0399}\x{0342}", +'8151' => "\x{0399}\x{0308}\x{0342}", +'8162' => "\x{03A5}\x{0308}\x{0300}", +'8163' => "\x{03A5}\x{0308}\x{0301}", +'8164' => "\x{03A1}\x{0313}", +'8166' => "\x{03A5}\x{0342}", +'8167' => "\x{03A5}\x{0308}\x{0342}", +'8178' => "\x{1FFA}\x{0345}", +'8179' => "\x{1FFC}", +'8180' => "\x{038F}\x{0345}", +'8182' => "\x{03A9}\x{0342}", +'8183' => "\x{03A9}\x{0342}\x{0345}", +'8188' => "\x{1FFC}", +'64256' => "\x{0046}\x{0066}", +'64257' => "\x{0046}\x{0069}", +'64258' => "\x{0046}\x{006C}", +'64259' => "\x{0046}\x{0066}\x{0069}", +'64260' => "\x{0046}\x{0066}\x{006C}", +'64261' => "\x{0053}\x{0074}", +'64262' => "\x{0053}\x{0074}", +'64275' => "\x{0544}\x{0576}", +'64276' => "\x{0544}\x{0565}", +'64277' => "\x{0544}\x{056B}", +'64278' => "\x{054E}\x{0576}", +'64279' => "\x{0544}\x{056D}", +); + return <<'END'; 0061 0041 0062 0042 diff --git a/lib/unicore/To/Upper.pl b/lib/unicore/To/Upper.pl index bfdd4ea..a9c7a9f 100644 --- a/lib/unicore/To/Upper.pl +++ b/lib/unicore/To/Upper.pl @@ -1,6 +1,112 @@ # !!!!!!! DO NOT EDIT THIS FILE !!!!!!! # This file is built by mktables from e.g. Unicode.txt. # Any changes made here will be lost! + +%utf8::ToSpecUpper = ( +'223' => "\x{0053}\x{0053}", +'329' => "\x{02BC}\x{004E}", +'496' => "\x{004A}\x{030C}", +'912' => "\x{0399}\x{0308}\x{0301}", +'944' => "\x{03A5}\x{0308}\x{0301}", +'1415' => "\x{0535}\x{0552}", +'7830' => "\x{0048}\x{0331}", +'7831' => "\x{0054}\x{0308}", +'7832' => "\x{0057}\x{030A}", +'7833' => "\x{0059}\x{030A}", +'7834' => "\x{0041}\x{02BE}", +'8016' => "\x{03A5}\x{0313}", +'8018' => "\x{03A5}\x{0313}\x{0300}", +'8020' => "\x{03A5}\x{0313}\x{0301}", +'8022' => "\x{03A5}\x{0313}\x{0342}", +'8064' => "\x{1F08}\x{0399}", +'8065' => "\x{1F09}\x{0399}", +'8066' => "\x{1F0A}\x{0399}", +'8067' => "\x{1F0B}\x{0399}", +'8068' => "\x{1F0C}\x{0399}", +'8069' => "\x{1F0D}\x{0399}", +'8070' => "\x{1F0E}\x{0399}", +'8071' => "\x{1F0F}\x{0399}", +'8072' => "\x{1F08}\x{0399}", +'8073' => "\x{1F09}\x{0399}", +'8074' => "\x{1F0A}\x{0399}", +'8075' => "\x{1F0B}\x{0399}", +'8076' => "\x{1F0C}\x{0399}", +'8077' => "\x{1F0D}\x{0399}", +'8078' => "\x{1F0E}\x{0399}", +'8079' => "\x{1F0F}\x{0399}", +'8080' => "\x{1F28}\x{0399}", +'8081' => "\x{1F29}\x{0399}", +'8082' => "\x{1F2A}\x{0399}", +'8083' => "\x{1F2B}\x{0399}", +'8084' => "\x{1F2C}\x{0399}", +'8085' => "\x{1F2D}\x{0399}", +'8086' => "\x{1F2E}\x{0399}", +'8087' => "\x{1F2F}\x{0399}", +'8088' => "\x{1F28}\x{0399}", +'8089' => "\x{1F29}\x{0399}", +'8090' => "\x{1F2A}\x{0399}", +'8091' => "\x{1F2B}\x{0399}", +'8092' => "\x{1F2C}\x{0399}", +'8093' => "\x{1F2D}\x{0399}", +'8094' => "\x{1F2E}\x{0399}", +'8095' => "\x{1F2F}\x{0399}", +'8096' => "\x{1F68}\x{0399}", +'8097' => "\x{1F69}\x{0399}", +'8098' => "\x{1F6A}\x{0399}", +'8099' => "\x{1F6B}\x{0399}", +'8100' => "\x{1F6C}\x{0399}", +'8101' => "\x{1F6D}\x{0399}", +'8102' => "\x{1F6E}\x{0399}", +'8103' => "\x{1F6F}\x{0399}", +'8104' => "\x{1F68}\x{0399}", +'8105' => "\x{1F69}\x{0399}", +'8106' => "\x{1F6A}\x{0399}", +'8107' => "\x{1F6B}\x{0399}", +'8108' => "\x{1F6C}\x{0399}", +'8109' => "\x{1F6D}\x{0399}", +'8110' => "\x{1F6E}\x{0399}", +'8111' => "\x{1F6F}\x{0399}", +'8114' => "\x{1FBA}\x{0399}", +'8115' => "\x{0391}\x{0399}", +'8116' => "\x{0386}\x{0399}", +'8118' => "\x{0391}\x{0342}", +'8119' => "\x{0391}\x{0342}\x{0399}", +'8124' => "\x{0391}\x{0399}", +'8130' => "\x{1FCA}\x{0399}", +'8131' => "\x{0397}\x{0399}", +'8132' => "\x{0389}\x{0399}", +'8134' => "\x{0397}\x{0342}", +'8135' => "\x{0397}\x{0342}\x{0399}", +'8140' => "\x{0397}\x{0399}", +'8146' => "\x{0399}\x{0308}\x{0300}", +'8147' => "\x{0399}\x{0308}\x{0301}", +'8150' => "\x{0399}\x{0342}", +'8151' => "\x{0399}\x{0308}\x{0342}", +'8162' => "\x{03A5}\x{0308}\x{0300}", +'8163' => "\x{03A5}\x{0308}\x{0301}", +'8164' => "\x{03A1}\x{0313}", +'8166' => "\x{03A5}\x{0342}", +'8167' => "\x{03A5}\x{0308}\x{0342}", +'8178' => "\x{1FFA}\x{0399}", +'8179' => "\x{03A9}\x{0399}", +'8180' => "\x{038F}\x{0399}", +'8182' => "\x{03A9}\x{0342}", +'8183' => "\x{03A9}\x{0342}\x{0399}", +'8188' => "\x{03A9}\x{0399}", +'64256' => "\x{0046}\x{0046}", +'64257' => "\x{0046}\x{0049}", +'64258' => "\x{0046}\x{004C}", +'64259' => "\x{0046}\x{0046}\x{0049}", +'64260' => "\x{0046}\x{0046}\x{004C}", +'64261' => "\x{0053}\x{0054}", +'64262' => "\x{0053}\x{0054}", +'64275' => "\x{0544}\x{0546}", +'64276' => "\x{0544}\x{0535}", +'64277' => "\x{0544}\x{053B}", +'64278' => "\x{054E}\x{0546}", +'64279' => "\x{0544}\x{053D}", +); + return <<'END'; 0061 0041 0062 0042 diff --git a/lib/unicore/mktables b/lib/unicore/mktables index 3328f69..7d8912d 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -745,16 +745,32 @@ if (open(my $SpecCase, "SpecCase.txt")) { } # Now write out the special cases properties in their code point order. -# The To/Spec{Lower,Title,Upper}.pl are unused for now since the swash -# routines do not do returning multiple characters. +# Prepend them to the To/{Upper,Lower,Title}.pl. for my $case (qw(Lower Title Upper)) { - my @case; - for my $prop (sort { $a->[0] <=> $b->[0] } @{$Case{$case}}) { - my ($ix, $code, $to) = @$prop; - append(\@case, $code, $to); + my $NormalCase = do "To/$case.pl"; + if (open(my $Case, ">To/$case.pl")) { + header($Case); + print $Case <[0] <=> $b->[0] } @{$Case{$case}}) { + my ($ix, $code, $to) = @$prop; + my $tostr = + join "", map { sprintf "\\x{%s}", $_ } split ' ', $to; + print $Case qq['$ix' => "$tostr",\n]; + } + print $Case < work. Returns an lowercased version of EXPR. This is the internal function implementing the C<\L> escape in double-quoted strings. Respects current LC_CTYPE locale if C in force. See L -and L. +and L for more details about locale and Unicode support. If EXPR is omitted, uses C<$_>. @@ -2341,7 +2341,8 @@ If EXPR is omitted, uses C<$_>. Returns the value of EXPR with the first character lowercased. This is the internal function implementing the C<\l> escape in double-quoted strings. Respects current LC_CTYPE locale if C in force. See L and L. +locale> in force. See L and L for more +details about locale and Unicode support. If EXPR is omitted, uses C<$_>. @@ -5464,8 +5465,9 @@ otherwise. Returns an uppercased version of EXPR. This is the internal function implementing the C<\U> escape in double-quoted strings. Respects current LC_CTYPE locale if C in force. See L -and L. It does not attempt to do titlecase mapping on -initial letters. See C for that. +and L for more details about locale and Unicode support. +It does not attempt to do titlecase mapping on initial letters. See +C for that. If EXPR is omitted, uses C<$_>. @@ -5476,7 +5478,8 @@ If EXPR is omitted, uses C<$_>. Returns the value of EXPR with the first character in uppercase (titlecase in Unicode). This is the internal function implementing the C<\u> escape in double-quoted strings. Respects current LC_CTYPE -locale if C in force. See L and L. +locale if C in force. See L and L +for more details about locale and Unicode support. If EXPR is omitted, uses C<$_>. diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod index 4e7c936..9b4d2e3 100644 --- a/pod/perlunicode.pod +++ b/pod/perlunicode.pod @@ -552,15 +552,37 @@ wide bit complement. =item * -lc(), uc(), lcfirst(), and ucfirst() work only for some of the -simplest cases, where the mapping goes from a single Unicode character -to another single Unicode character, and where the mapping does not -depend on surrounding characters, or on locales. More complex cases, -where for example one character maps into several, are not yet -implemented. See the Unicode Technical Report #21, Case Mappings, -for more details. The Unicode::UCD module (part of Perl since 5.8.0) -casespec() and casefold() interfaces supply information about the more -complex cases. +lc(), uc(), lcfirst(), and ucfirst() work for the following cases: + +=over 8 + +=item * + +the case mapping is from a single Unicode character to another +single Unicode character + +=item * + +the case mapping is from a single Unicode character to more +than one Unicode character + +=back + +What doesn't yet work are the followng cases: + +=over 8 + +=item * + +the "final sigma" (Greek) + +=item * + +anything to with locales (Lithuanian, Turkish, Azeri) + +=back + +See the Unicode Technical Report #21, Case Mappings, for more details. =item * diff --git a/proto.h b/proto.h index ff3ac5f..b56817a 100644 --- a/proto.h +++ b/proto.h @@ -797,6 +797,7 @@ PERL_CALLCONV SV* Perl_swash_init(pTHX_ char* pkg, char* name, SV* listsv, I32 m PERL_CALLCONV UV Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8); PERL_CALLCONV void Perl_taint_env(pTHX); PERL_CALLCONV void Perl_taint_proper(pTHX_ const char* f, const char* s); +PERL_CALLCONV UV Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swash, char *normal, char *special); PERL_CALLCONV UV Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp); PERL_CALLCONV UV Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp); PERL_CALLCONV UV Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp); diff --git a/t/op/lc.t b/t/op/lc.t index 2db3a8a..9333c6c 100644 --- a/t/op/lc.t +++ b/t/op/lc.t @@ -1,59 +1,93 @@ #!./perl -print "1..40\n"; +print "1..42\n"; + +my $test = 1; + +sub ok { + if ($_[0]) { + if ($_[1]) { + print "ok $test - $_[1]\n"; + } else { + print "ok $test\n"; + } + } else { + if ($_[1]) { + print "not ok $test - $_[1]\n"; + } else { + print "not ok $test\n"; + } + } + $test++; +} $a = "HELLO.* world"; $b = "hello.* WORLD"; -print "ok 1\n" if "\Q$a\E." eq "HELLO\\.\\*\\ world."; -print "ok 2\n" if "\u$a" eq "HELLO\.\* world"; -print "ok 3\n" if "\l$a" eq "hELLO\.\* world"; -print "ok 4\n" if "\U$a" eq "HELLO\.\* WORLD"; -print "ok 5\n" if "\L$a" eq "hello\.\* world"; - -print "ok 6\n" if quotemeta($a) eq "HELLO\\.\\*\\ world"; -print "ok 7\n" if ucfirst($a) eq "HELLO\.\* world"; -print "ok 8\n" if lcfirst($a) eq "hELLO\.\* world"; -print "ok 9\n" if uc($a) eq "HELLO\.\* WORLD"; -print "ok 10\n" if lc($a) eq "hello\.\* world"; - -print "ok 11\n" if "\Q$b\E." eq "hello\\.\\*\\ WORLD."; -print "ok 12\n" if "\u$b" eq "Hello\.\* WORLD"; -print "ok 13\n" if "\l$b" eq "hello\.\* WORLD"; -print "ok 14\n" if "\U$b" eq "HELLO\.\* WORLD"; -print "ok 15\n" if "\L$b" eq "hello\.\* world"; - -print "ok 16\n" if quotemeta($b) eq "hello\\.\\*\\ WORLD"; -print "ok 17\n" if ucfirst($b) eq "Hello\.\* WORLD"; -print "ok 18\n" if lcfirst($b) eq "hello\.\* WORLD"; -print "ok 19\n" if uc($b) eq "HELLO\.\* WORLD"; -print "ok 20\n" if lc($b) eq "hello\.\* world"; +ok("\Q$a\E." eq "HELLO\\.\\*\\ world.", '\Q\E HELLO.* world'); +ok("\u$a" eq "HELLO\.\* world", '\u'); +ok("\l$a" eq "hELLO\.\* world", '\l'); +ok("\U$a" eq "HELLO\.\* WORLD", '\U'); +ok("\L$a" eq "hello\.\* world", '\L'); + +ok(quotemeta($a) eq "HELLO\\.\\*\\ world", 'quotemeta'); +ok(ucfirst($a) eq "HELLO\.\* world", 'ucfirst'); +ok(lcfirst($a) eq "hELLO\.\* world", 'lcfirst'); +ok(uc($a) eq "HELLO\.\* WORLD", 'uc'); +ok(lc($a) eq "hello\.\* world", 'lc'); + +ok("\Q$b\E." eq "hello\\.\\*\\ WORLD.", '\Q\E hello.* WORLD'); +ok("\u$b" eq "Hello\.\* WORLD", '\u'); +ok("\l$b" eq "hello\.\* WORLD", '\l'); +ok("\U$b" eq "HELLO\.\* WORLD", '\U'); +ok("\L$b" eq "hello\.\* world", '\L'); + +ok(quotemeta($b) eq "hello\\.\\*\\ WORLD", 'quotemeta'); +ok(ucfirst($b) eq "Hello\.\* WORLD", 'ucfirst'); +ok(lcfirst($b) eq "hello\.\* WORLD", 'lcfirst'); +ok(uc($b) eq "HELLO\.\* WORLD", 'uc'); +ok(lc($b) eq "hello\.\* world", 'lc'); + +# \x{100} is LATIN CAPITAL LETTER A WITH MACRON; its bijective lowercase is +# \x{100}, LATIN SMALL LETTER A WITH MACRON. $a = "\x{100}\x{101}\x{41}\x{61}"; $b = "\x{101}\x{100}\x{61}\x{41}"; -print "ok 21\n" if "\Q$a\E." eq "\x{100}\x{101}\x{41}\x{61}."; -print "ok 22\n" if "\u$a" eq "\x{100}\x{101}\x{41}\x{61}"; -print "ok 23\n" if "\l$a" eq "\x{101}\x{101}\x{41}\x{61}"; -print "ok 24\n" if "\U$a" eq "\x{100}\x{100}\x{41}\x{41}"; -print "ok 25\n" if "\L$a" eq "\x{101}\x{101}\x{61}\x{61}"; - -print "ok 26\n" if quotemeta($a) eq "\x{100}\x{101}\x{41}\x{61}"; -print "ok 27\n" if ucfirst($a) eq "\x{100}\x{101}\x{41}\x{61}"; -print "ok 28\n" if lcfirst($a) eq "\x{101}\x{101}\x{41}\x{61}"; -print "ok 29\n" if uc($a) eq "\x{100}\x{100}\x{41}\x{41}"; -print "ok 30\n" if lc($a) eq "\x{101}\x{101}\x{61}\x{61}"; - -print "ok 31\n" if "\Q$b\E." eq "\x{101}\x{100}\x{61}\x{41}."; -print "ok 32\n" if "\u$b" eq "\x{100}\x{100}\x{61}\x{41}"; -print "ok 33\n" if "\l$b" eq "\x{101}\x{100}\x{61}\x{41}"; -print "ok 34\n" if "\U$b" eq "\x{100}\x{100}\x{41}\x{41}"; -print "ok 35\n" if "\L$b" eq "\x{101}\x{101}\x{61}\x{61}"; - -print "ok 36\n" if quotemeta($b) eq "\x{101}\x{100}\x{61}\x{41}"; -print "ok 37\n" if ucfirst($b) eq "\x{100}\x{100}\x{61}\x{41}"; -print "ok 38\n" if lcfirst($b) eq "\x{101}\x{100}\x{61}\x{41}"; -print "ok 39\n" if uc($b) eq "\x{100}\x{100}\x{41}\x{41}"; -print "ok 40\n" if lc($b) eq "\x{101}\x{101}\x{61}\x{61}"; +ok("\Q$a\E." eq "\x{100}\x{101}\x{41}\x{61}.", '\Q\E \x{100}\x{101}\x{41}\x{61}'); +ok("\u$a" eq "\x{100}\x{101}\x{41}\x{61}", '\u'); +ok("\l$a" eq "\x{101}\x{101}\x{41}\x{61}", '\l'); +ok("\U$a" eq "\x{100}\x{100}\x{41}\x{41}", '\U'); +ok("\L$a" eq "\x{101}\x{101}\x{61}\x{61}", '\L'); + +ok(quotemeta($a) eq "\x{100}\x{101}\x{41}\x{61}", 'quotemeta'); +ok(ucfirst($a) eq "\x{100}\x{101}\x{41}\x{61}", 'ucfirst'); +ok(lcfirst($a) eq "\x{101}\x{101}\x{41}\x{61}", 'lcfirst'); +ok(uc($a) eq "\x{100}\x{100}\x{41}\x{41}", 'uc'); +ok(lc($a) eq "\x{101}\x{101}\x{61}\x{61}", 'lc'); + +ok("\Q$b\E." eq "\x{101}\x{100}\x{61}\x{41}.", '\Q\E \x{101}\x{100}\x{61}\x{41}'); +ok("\u$b" eq "\x{100}\x{100}\x{61}\x{41}", '\u'); +ok("\l$b" eq "\x{101}\x{100}\x{61}\x{41}", '\l'); +ok("\U$b" eq "\x{100}\x{100}\x{41}\x{41}", '\U'); +ok("\L$b" eq "\x{101}\x{101}\x{61}\x{61}", '\L'); + +ok(quotemeta($b) eq "\x{101}\x{100}\x{61}\x{41}", 'quotemeta'); +ok(ucfirst($b) eq "\x{100}\x{100}\x{61}\x{41}", 'ucfirst'); +ok(lcfirst($b) eq "\x{101}\x{100}\x{61}\x{41}", 'lcfirst'); +ok(uc($b) eq "\x{100}\x{100}\x{41}\x{41}", 'uc'); +ok(lc($b) eq "\x{101}\x{101}\x{61}\x{61}", 'lc'); + +# \x{DF} is LATIN SMALL LETTER SHARP S, its uppercase is SS or \x{53}\x{53}; +# \x{149} is LATIN SMALL LETTER N PRECEDED BY APOSTROPHE, its uppercase is +# \x{2BC}\x{E4} or MODIFIER LETTER APOSTROPHE and N. + +ok("\U\x{DF}ab\x{149}cd" eq "\x{53}\x{53}AB\x{2BC}\x{4E}CD", + "multicharacter uppercase"); + +# The \x{DF} is its own lowercase, ditto for \x{149}. +# There are no single character -> multiple characters lowercase mappings. +ok("\L\x{DF}AB\x{149}CD" eq "\x{DF}ab\x{149}cd", + "multicharacter lowercase"); diff --git a/utf8.c b/utf8.c index 0c09469..4a3fe1d 100644 --- a/utf8.c +++ b/utf8.c @@ -1181,45 +1181,63 @@ Perl_is_utf8_mark(pTHX_ U8 *p) } UV -Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp) +Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp,char *normal, char *special) { UV uv; - if (!PL_utf8_toupper) - PL_utf8_toupper = swash_init("utf8", "ToUpper", &PL_sv_undef, 4, 0); - uv = swash_fetch(PL_utf8_toupper, p, TRUE); - uv = uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p, 0); + if (!*swashp) + *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0); + uv = swash_fetch(*swashp, p, TRUE); + if (uv) + uv = UNI_TO_NATIVE(uv); + else { + HV *hv; + SV *keysv; + HE *he; + + uv = utf8_to_uvchr(p, 0); + + if ((hv = get_hv(special, FALSE)) && + (keysv = sv_2mortal(Perl_newSVpvf(aTHX_ "%"UVuf, uv))) && + (he = hv_fetch_ent(hv, keysv, FALSE, 0))) { + SV *val = HeVAL(he); + char *s = SvPV(val, *lenp); + U8 c = *(U8*)s; + if (*lenp > 1 || UNI_IS_INVARIANT(c)) + Copy(s, ustrp, *lenp, U8); + else { + /* something in the 0x80..0xFF range */ + ustrp[0] = UTF8_EIGHT_BIT_HI(c); + ustrp[1] = UTF8_EIGHT_BIT_LO(c); + *lenp = 2; + } + return 0; + } + } *lenp = UNISKIP(uv); uvuni_to_utf8(ustrp, uv); return uv; } UV -Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp) +Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp) { - UV uv; + return Perl_to_utf8_case(aTHX_ p, ustrp, lenp, + &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper"); +} - if (!PL_utf8_totitle) - PL_utf8_totitle = swash_init("utf8", "ToTitle", &PL_sv_undef, 4, 0); - uv = swash_fetch(PL_utf8_totitle, p, TRUE); - uv = uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p, 0); - *lenp = UNISKIP(uv); - uvuni_to_utf8(ustrp, uv); - return uv; +UV +Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp) +{ + return Perl_to_utf8_case(aTHX_ p, ustrp, lenp, + &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle"); } UV Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp) { - UV uv; - - if (!PL_utf8_tolower) - PL_utf8_tolower = swash_init("utf8", "ToLower", &PL_sv_undef, 4, 0); - uv = swash_fetch(PL_utf8_tolower, p, TRUE); - uv = uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p, 0); - *lenp = UNISKIP(uv); - uvuni_to_utf8(ustrp, uv); - return uv; + return Perl_to_utf8_case(aTHX_ p, ustrp, lenp, + &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower"); } /* a "swash" is a swatch hash */ -- 2.7.4