From 3bec356411ea30186c2876cb56cbf5d69845ed32 Mon Sep 17 00:00:00 2001 From: Jarkko Hietaniemi Date: Fri, 1 Dec 2000 06:45:53 +0000 Subject: [PATCH] Get the three different space character classes right under utf8. p4raw-id: //depot/perl@7940 --- MANIFEST | 2 ++ lib/unicode/Is/Blank.pl | 12 ++++++++++++ lib/unicode/Is/SpacePerl.pl | 14 ++++++++++++++ lib/unicode/mktables.PL | 5 +++++ pod/perlre.pod | 6 ++++-- regcomp.c | 15 ++++++++------- regexec.c | 2 +- t/op/pat.t | 6 +++--- utf8.c | 2 +- 9 files changed, 50 insertions(+), 14 deletions(-) create mode 100644 lib/unicode/Is/Blank.pl create mode 100644 lib/unicode/Is/SpacePerl.pl diff --git a/MANIFEST b/MANIFEST index 4607250..7da209e 100644 --- a/MANIFEST +++ b/MANIFEST @@ -922,6 +922,7 @@ lib/unicode/Is/BidiRLE.pl Unicode character database lib/unicode/Is/BidiRLO.pl Unicode character database lib/unicode/Is/BidiS.pl Unicode character database lib/unicode/Is/BidiWS.pl Unicode character database +lib/unicode/Is/Blank.pl Unicode character database lib/unicode/Is/C.pl Unicode character database lib/unicode/Is/Cc.pl Unicode character database lib/unicode/Is/Cf.pl Unicode character database @@ -1010,6 +1011,7 @@ lib/unicode/Is/Sk.pl Unicode character database lib/unicode/Is/Sm.pl Unicode character database lib/unicode/Is/So.pl Unicode character database lib/unicode/Is/Space.pl Unicode character database +lib/unicode/Is/SpacePerl.pl Unicode character database lib/unicode/Is/SylA.pl Unicode character database lib/unicode/Is/SylAA.pl Unicode character database lib/unicode/Is/SylAAI.pl Unicode character database diff --git a/lib/unicode/Is/Blank.pl b/lib/unicode/Is/Blank.pl new file mode 100644 index 0000000..8642921 --- /dev/null +++ b/lib/unicode/Is/Blank.pl @@ -0,0 +1,12 @@ +# !!!!!!! DO NOT EDIT THIS FILE !!!!!!! +# This file is built by mktables.PL from e.g. Unicode.301. +# Any changes made here will be lost! 
+return <<'END'; +0009 +0020 +00a0 +1680 +2000 200b +202f +3000 +END diff --git a/lib/unicode/Is/SpacePerl.pl b/lib/unicode/Is/SpacePerl.pl new file mode 100644 index 0000000..2bb74de --- /dev/null +++ b/lib/unicode/Is/SpacePerl.pl @@ -0,0 +1,14 @@ +# !!!!!!! DO NOT EDIT THIS FILE !!!!!!! +# This file is built by mktables.PL from e.g. Unicode.301. +# Any changes made here will be lost! +return <<'END'; +0009 000a +000c 000d +0020 +00a0 +1680 +2000 200b +2028 2029 +202f +3000 +END diff --git a/lib/unicode/mktables.PL b/lib/unicode/mktables.PL index d8b57b6..82b35ef 100755 --- a/lib/unicode/mktables.PL +++ b/lib/unicode/mktables.PL @@ -25,8 +25,13 @@ mkdir "To", 0755; # 000B: VERTICAL TABULATION # 000C: FORM FEED # 000D: CARRIAGE RETURN + # 0020: SPACE ['IsSpace', '$cat =~ /^Z/ || $code =~ /^(0009|000A|000B|000C|000D)$/', ''], + ['IsSpacePerl', + '$cat =~ /^Z/ || + $code =~ /^(0009|000A|000C|000D)$/', ''], + ['IsBlank', '$cat =~ /^Z[^lp]$/ || $code eq "0009"', ''], ['IsDigit', '$cat =~ /^Nd$/', ''], ['IsUpper', '$cat =~ /^L[ut]$/', ''], ['IsLower', '$cat =~ /^Ll$/', ''], diff --git a/pod/perlre.pod b/pod/perlre.pod index 182f5bd..c5ecb13 100644 --- a/pod/perlre.pod +++ b/pod/perlre.pod @@ -225,19 +225,21 @@ whole character class. For example: matches zero, one, any alphabetic character, and the percentage sign. 
If the C<utf8> pragma is used, the following equivalences to Unicode -\p{} constructs hold: +\p{} constructs and equivalent backslash character classes (if available), +will hold: alpha IsAlpha alnum IsAlnum ascii IsASCII blank IsSpace cntrl IsCntrl - digit IsDigit + digit IsDigit \d graph IsGraph lower IsLower print IsPrint punct IsPunct space IsSpace + IsSpacePerl \s upper IsUpper word IsWord xdigit IsXDigit diff --git a/regcomp.c b/regcomp.c index 784e83e..3b4f481 100644 --- a/regcomp.c +++ b/regcomp.c @@ -3705,7 +3705,7 @@ S_regclassutf8(pTHX_ RExC_state_t *pRExC_state) flags |= ANYOF_FOLD; if (LOC) flags |= ANYOF_LOCALE; - listsv = newSVpvn("# comment\n",10); + listsv = newSVpvn("# comment\n", 10); } if (!SIZE_ONLY && ckWARN(WARN_REGEXP)) @@ -3868,15 +3868,16 @@ S_regclassutf8(pTHX_ RExC_state_t *pRExC_state) case ANYOF_NPUNCT: Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n"); break; case ANYOF_SPACE: - case ANYOF_PSXSPC: + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");break; + case ANYOF_NSPACE: + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");break; case ANYOF_BLANK: - /* Not very true for PSXSPC and BLANK - * but not feeling like creating IsPOSIXSpace and - * IsBlank right now. 
--jhi */ + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n"); break; + case ANYOF_NBLANK: + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n"); break; + case ANYOF_PSXSPC: Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n"); break; - case ANYOF_NSPACE: case ANYOF_NPSXSPC: - case ANYOF_NBLANK: Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n"); break; case ANYOF_UPPER: Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n"); break; diff --git a/regexec.c b/regexec.c index 018c6c8..18c06d5 100644 --- a/regexec.c +++ b/regexec.c @@ -3773,7 +3773,7 @@ S_reginclass(pTHX_ register regnode *p, register I32 c) (ANYOF_CLASS_TEST(p, ANYOF_NXDIGIT) && !isXDIGIT(c)) || (ANYOF_CLASS_TEST(p, ANYOF_PSXSPC) && isPSXSPC(c)) || (ANYOF_CLASS_TEST(p, ANYOF_NPSXSPC) && !isPSXSPC(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_BLANK) && isBLANK(c)) || + (ANYOF_CLASS_TEST(p, ANYOF_BLANK) && isBLANK(c)) || (ANYOF_CLASS_TEST(p, ANYOF_NBLANK) && !isBLANK(c)) ) /* How's that for a conditional? */ { diff --git a/t/op/pat.t b/t/op/pat.t index 8c3638c..aaec39d 100755 --- a/t/op/pat.t +++ b/t/op/pat.t @@ -1107,15 +1107,15 @@ my @space1 = sort grep { $space{$_} =~ /[[:space:]]/ } keys %space; my @space2 = sort grep { $space{$_} =~ /[[:blank:]]/ } keys %space; print "not " unless "@space0" eq "cr ff lf spc tab"; -print "ok $test\n"; +print "ok $test # @space0\n"; $test++; print "not " unless "@space1" eq "cr ff lf spc tab vt"; -print "ok $test\n"; +print "ok $test # @space1\n"; $test++; print "not " unless "@space2" eq "spc tab"; -print "ok $test\n"; +print "ok $test # @space2\n"; $test++; # bugid 20001021.005 - this caused a SEGV diff --git a/utf8.c b/utf8.c index e313258..9e943ac 100644 --- a/utf8.c +++ b/utf8.c @@ -899,7 +899,7 @@ Perl_is_utf8_space(pTHX_ U8 *p) if (!is_utf8_char(p)) return FALSE; if (!PL_utf8_space) - PL_utf8_space = swash_init("utf8", "IsSpace", &PL_sv_undef, 0, 0); + PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0); return swash_fetch(PL_utf8_space, p); } -- 2.7.4