From eba68aa09a0b159ee4eef3cee1bd58ee95fdb81a Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sun, 23 Dec 2012 13:49:02 -0700 Subject: [PATCH] handy.h: Add full complement of isIDCONT() macros This also changes isIDCONT_utf8() to use the Perl definition, which excludes any \W characters (the Unicode definition includes a few of these). Tests are also added. These macros remain undocumented for now. --- embed.fnc | 2 ++ embed.h | 2 ++ embedvar.h | 1 + ext/XS-APItest/APItest.pm | 2 +- ext/XS-APItest/APItest.xs | 49 +++++++++++++++++++++++++++++++++++++++++++++++ ext/XS-APItest/t/handy.t | 7 ++++--- handy.h | 15 ++++++++++++--- intrpvar.h | 1 + proto.h | 9 +++++++++ sv.c | 1 + utf8.c | 19 ++++++++++++++++++ 11 files changed, 101 insertions(+), 7 deletions(-) diff --git a/embed.fnc b/embed.fnc index 0ad711f..61934fb 100644 --- a/embed.fnc +++ b/embed.fnc @@ -637,6 +637,7 @@ AMp |UV |_to_uni_fold_flags|UV c|NN U8 *p|NN STRLEN *lenp|const U8 flags ADMpPR |bool |is_uni_alnum_lc|UV c ADMpPR |bool |is_uni_alnumc_lc|UV c ADMpPR |bool |is_uni_idfirst_lc|UV c +AMpR |bool |_is_uni_perl_idcont|UV c AMpR |bool |_is_uni_perl_idstart|UV c ADMpPR |bool |is_uni_alpha_lc|UV c ADMpPR |bool |is_uni_ascii_lc|UV c @@ -662,6 +663,7 @@ ADMpR |bool |is_utf8_alnum |NN const U8 *p ADMpR |bool |is_utf8_alnumc |NN const U8 *p ADMpR |bool |is_utf8_idfirst|NN const U8 *p ADMpR |bool |is_utf8_xidfirst|NN const U8 *p +AMpR |bool |_is_utf8_perl_idcont|NN const U8 *p AMpR |bool |_is_utf8_perl_idstart|NN const U8 *p ADMpR |bool |is_utf8_idcont |NN const U8 *p ADMpR |bool |is_utf8_xidcont |NN const U8 *p diff --git a/embed.h b/embed.h index 89dca3c..2aae592 100644 --- a/embed.h +++ b/embed.h @@ -28,9 +28,11 @@ #define Gv_AMupdate(a,b) Perl_Gv_AMupdate(aTHX_ a,b) #define _is_uni_FOO(a,b) Perl__is_uni_FOO(aTHX_ a,b) +#define _is_uni_perl_idcont(a) Perl__is_uni_perl_idcont(aTHX_ a) #define _is_uni_perl_idstart(a) Perl__is_uni_perl_idstart(aTHX_ a) #define _is_utf8_FOO(a,b) Perl__is_utf8_FOO(aTHX_ a,b) #define _is_utf8_mark(a) Perl__is_utf8_mark(aTHX_ a) +#define _is_utf8_perl_idcont(a) Perl__is_utf8_perl_idcont(aTHX_ a) #define _is_utf8_perl_idstart(a) Perl__is_utf8_perl_idstart(aTHX_ a) #define _to_uni_fold_flags(a,b,c,d) Perl__to_uni_fold_flags(aTHX_ a,b,c,d) #define _to_utf8_fold_flags(a,b,c,d,e) Perl__to_utf8_fold_flags(aTHX_ a,b,c,d,e) diff --git a/embedvar.h b/embedvar.h index 6b1775c..2964e62 100644 --- a/embedvar.h +++ b/embedvar.h @@ -334,6 +334,7 @@ #define PL_utf8_idcont (vTHX->Iutf8_idcont) #define PL_utf8_idstart (vTHX->Iutf8_idstart) #define PL_utf8_mark (vTHX->Iutf8_mark) +#define PL_utf8_perl_idcont (vTHX->Iutf8_perl_idcont) #define PL_utf8_perl_idstart (vTHX->Iutf8_perl_idstart) #define PL_utf8_swash_ptrs (vTHX->Iutf8_swash_ptrs) #define PL_utf8_tofold (vTHX->Iutf8_tofold) diff --git a/ext/XS-APItest/APItest.pm b/ext/XS-APItest/APItest.pm index 831b6f3..3d7449b 100644 --- a/ext/XS-APItest/APItest.pm +++ b/ext/XS-APItest/APItest.pm @@ -5,7 +5,7 @@ use strict; use warnings; use Carp; -our $VERSION = '0.49'; +our $VERSION = '0.50'; require XSLoader; diff --git a/ext/XS-APItest/APItest.xs b/ext/XS-APItest/APItest.xs index 2155bc9..b23232c 100644 --- a/ext/XS-APItest/APItest.xs +++ b/ext/XS-APItest/APItest.xs @@ -3998,6 +3998,55 @@ test_isIDFIRST_LC_utf8(unsigned char * p) RETVAL bool +test_isIDCONT_uni(UV ord) + CODE: + RETVAL = isIDCONT_uni(ord); + OUTPUT: + RETVAL + +bool +test_isIDCONT_LC_uvchr(UV ord) + CODE: + RETVAL = isIDCONT_LC_uvchr(ord); + OUTPUT: + RETVAL + +bool +test_isIDCONT_A(UV ord) + CODE: + RETVAL = isIDCONT_A(ord); + OUTPUT: + RETVAL + +bool +test_isIDCONT_L1(UV ord) + CODE: + RETVAL = isIDCONT_L1(ord); + OUTPUT: + RETVAL + +bool +test_isIDCONT_LC(UV ord) + CODE: + RETVAL = isIDCONT_LC(ord); + OUTPUT: + RETVAL + +bool +test_isIDCONT_utf8(unsigned char * p) + CODE: + RETVAL = isIDCONT_utf8( p); + OUTPUT: + RETVAL + +bool +test_isIDCONT_LC_utf8(unsigned char * p) + CODE: + RETVAL = isIDCONT_LC_utf8( p); + OUTPUT: + RETVAL + +bool test_isSPACE_uni(UV ord) CODE: RETVAL = isSPACE_uni(ord); diff --git a/ext/XS-APItest/t/handy.t b/ext/XS-APItest/t/handy.t index ffa2f17..0730c10 100644 --- a/ext/XS-APItest/t/handy.t +++ b/ext/XS-APItest/t/handy.t @@ -38,6 +38,7 @@ my %properties = ( digit => 'Digit', graph => 'Graph', idfirst => '_Perl_IDStart', + idcont => '_Perl_IDCont', lower => 'Lower', print => 'Print', psxspc => 'XPosixSpace', @@ -73,9 +74,9 @@ foreach my $name (sort keys %properties) { last if $above_latins > 5; } - # This makes sure we are using the Perl definition of idfirst, and not the - # Unicode. There are a few differences. - push @code_points, ord "\N{ESTIMATED SYMBOL}" if $name eq 'idfirst'; + # This makes sure we are using the Perl definition of idfirst and idcont, + # and not the Unicode. There are a few differences. + push @code_points, ord "\N{ESTIMATED SYMBOL}" if $name =~ /^id(first|cont)/; # And finally one non-Unicode code point. push @code_points, 0x110000; # Above Unicode, no prop should match diff --git a/handy.h b/handy.h index 894b209..eaa00a4 100644 --- a/handy.h +++ b/handy.h @@ -679,8 +679,8 @@ character set, if possible; otherwise returns the input character itself. =cut -Still undocumented are , PSXSPC, VERTSPACE, and IDFIRST, and the other -toUPPER etc functions +XXX Still undocumented are PSXSPC, VERTSPACE, and IDFIRST IDCONT, and the +other toUPPER etc functions Note that these macros are repeated in Devel::PPPort, so should also be patched there. The file as of this writing is cpan/Devel-PPPort/parts/inc/misc @@ -1109,6 +1109,10 @@ EXTCONST U32 PL_charclass[]; # endif #endif /* USE_NEXT_CTYPE */ +#define isIDCONT(c) isWORDCHAR(c) +#define isIDCONT_A(c) isWORDCHAR_A(c) +#define isIDCONT_L1(c) isWORDCHAR_L1(c) +#define isIDCONT_LC(c) isWORDCHAR_LC(c) #define isPSXSPC_LC(c) isSPACE_LC(c) /* For internal core Perl use only. If the input is Latin1, use the Latin1 @@ -1131,6 +1135,7 @@ EXTCONST U32 PL_charclass[]; #define isCNTRL_uni(c) isCNTRL_L1(c) /* All controls are in Latin1 */ #define isDIGIT_uni(c) _generic_swash_uni(_CC_DIGIT, c) #define isGRAPH_uni(c) _generic_swash_uni(_CC_GRAPH, c) +#define isIDCONT_uni(c) _generic_uni(_CC_WORDCHAR, _is_uni_perl_idcont, c) #define isIDFIRST_uni(c) _generic_uni(_CC_IDFIRST, _is_uni_perl_idstart, c) #define isLOWER_uni(c) _generic_swash_uni(_CC_LOWER, c) #define isPRINT_uni(c) _generic_swash_uni(_CC_PRINT, c) @@ -1163,6 +1168,8 @@ EXTCONST U32 PL_charclass[]; #define isCNTRL_LC_uvchr(c) (c < 256 ? isCNTRL_LC(c) : 0) #define isDIGIT_LC_uvchr(c) _generic_LC_swash_uvchr(isDIGIT_LC, _CC_DIGIT, c) #define isGRAPH_LC_uvchr(c) _generic_LC_swash_uvchr(isGRAPH_LC, _CC_GRAPH, c) +#define isIDCONT_LC_uvchr(c) _generic_LC_uvchr(isIDCONT_LC, \ + _is_uni_perl_idcont, c) #define isIDFIRST_LC_uvchr(c) _generic_LC_uvchr(isIDFIRST_LC, \ _is_uni_perl_idstart, c) #define isLOWER_LC_uvchr(c) _generic_LC_swash_uvchr(isLOWER_LC, _CC_LOWER, c) @@ -1234,7 +1241,8 @@ EXTCONST U32 PL_charclass[]; #define isDIGIT_utf8(p) _generic_utf8_no_upper_latin1(_CC_DIGIT, p, \ _is_utf8_FOO(_CC_DIGIT, p)) #define isGRAPH_utf8(p) _generic_swash_utf8(_CC_GRAPH, p) -#define isIDCONT_utf8(p) _generic_func_utf8(_CC_WORDCHAR, is_utf8_xidcont, p) +#define isIDCONT_utf8(p) _generic_func_utf8(_CC_WORDCHAR, \ + _is_utf8_perl_idstart, p) /* To prevent S_scan_word in toke.c from hanging, we have to make sure that * IDFIRST is an alnum. See @@ -1288,6 +1296,7 @@ EXTCONST U32 PL_charclass[]; #define isCNTRL_LC_utf8(p) _generic_LC_utf8(isCNTRL_LC, p, 0) #define isDIGIT_LC_utf8(p) _generic_LC_swash_utf8(isDIGIT_LC, _CC_DIGIT, p) #define isGRAPH_LC_utf8(p) _generic_LC_swash_utf8(isGRAPH_LC, _CC_GRAPH, p) +#define isIDCONT_LC_utf8(p) _generic_LC_func_utf8(isIDCONT_LC, _is_utf8_perl_idcont, p) #define isIDFIRST_LC_utf8(p) _generic_LC_func_utf8(isIDFIRST_LC, _is_utf8_perl_idstart, p) #define isLOWER_LC_utf8(p) _generic_LC_swash_utf8(isLOWER_LC, _CC_LOWER, p) #define isPRINT_LC_utf8(p) _generic_LC_swash_utf8(isPRINT_LC, _CC_PRINT, p) diff --git a/intrpvar.h b/intrpvar.h index 2cc1ff3..f7176b1 100644 --- a/intrpvar.h +++ b/intrpvar.h @@ -663,6 +663,7 @@ PERLVAR(I, utf8_idstart, SV *) PERLVAR(I, utf8_idcont, SV *) PERLVAR(I, utf8_xidstart, SV *) PERLVAR(I, utf8_perl_idstart, SV *) +PERLVAR(I, utf8_perl_idcont, SV *) PERLVAR(I, utf8_xidcont, SV *) PERLVAR(I, sort_RealCmp, SVCOMPARE_t) diff --git a/proto.h b/proto.h index d27d8b1..e8af3c8 100644 --- a/proto.h +++ b/proto.h @@ -35,6 +35,9 @@ PERL_CALLCONV void Perl_Slab_Free(pTHX_ void *op) PERL_CALLCONV bool Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c) __attribute__warn_unused_result__; +PERL_CALLCONV bool Perl__is_uni_perl_idcont(pTHX_ UV c) + __attribute__warn_unused_result__; + PERL_CALLCONV bool Perl__is_uni_perl_idstart(pTHX_ UV c) __attribute__warn_unused_result__; @@ -50,6 +53,12 @@ PERL_CALLCONV bool Perl__is_utf8_mark(pTHX_ const U8 *p) #define PERL_ARGS_ASSERT__IS_UTF8_MARK \ assert(p) +PERL_CALLCONV bool Perl__is_utf8_perl_idcont(pTHX_ const U8 *p) + __attribute__warn_unused_result__ + __attribute__nonnull__(pTHX_1); +#define PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT \ + assert(p) + PERL_CALLCONV bool Perl__is_utf8_perl_idstart(pTHX_ const U8 *p) __attribute__warn_unused_result__ __attribute__nonnull__(pTHX_1); diff --git a/sv.c b/sv.c index e5bbb3d..9fc807a 100644 --- a/sv.c +++ b/sv.c @@ -13611,6 +13611,7 @@ perl_clone_using(PerlInterpreter *proto_perl, UV flags, PL_utf8_idstart = sv_dup_inc(proto_perl->Iutf8_idstart, param); PL_utf8_xidstart = sv_dup_inc(proto_perl->Iutf8_xidstart, param); PL_utf8_perl_idstart = sv_dup_inc(proto_perl->Iutf8_perl_idstart, param); + PL_utf8_perl_idcont = sv_dup_inc(proto_perl->Iutf8_perl_idcont, param); PL_utf8_idcont = sv_dup_inc(proto_perl->Iutf8_idcont, param); PL_utf8_xidcont = sv_dup_inc(proto_perl->Iutf8_xidcont, param); PL_utf8_foldable = sv_dup_inc(proto_perl->Iutf8_foldable, param); diff --git a/utf8.c b/utf8.c index ec4e627..dfb303f 100644 --- a/utf8.c +++ b/utf8.c @@ -1528,6 +1528,14 @@ Perl_is_uni_idfirst(pTHX_ UV c) } bool +Perl__is_uni_perl_idcont(pTHX_ UV c) +{ + U8 tmpbuf[UTF8_MAXBYTES+1]; + uvchr_to_utf8(tmpbuf, c); + return _is_utf8_perl_idcont(tmpbuf); +} + +bool Perl__is_uni_perl_idstart(pTHX_ UV c) { U8 tmpbuf[UTF8_MAXBYTES+1]; @@ -2120,6 +2128,17 @@ Perl__is_utf8_perl_idstart(pTHX_ const U8 *p) } bool +Perl__is_utf8_perl_idcont(pTHX_ const U8 *p) +{ + dVAR; + + PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT; + + return is_utf8_common(p, &PL_utf8_perl_idcont, "_Perl_IDCont"); +} + + +bool Perl_is_utf8_idcont(pTHX_ const U8 *p) { dVAR; -- 2.7.4