From f91dcd139f8093525e1a00c3803de98b1bf89cab Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Wed, 28 Nov 2012 20:16:06 -0700 Subject: [PATCH] Make isIDFIRST_uni() return identically as isIDFIRST_utf8() These two macros should have the same results for the same input code points. Prior to this patch, the _uni() macro returned the official Unicode ID_Start property, and the _utf8() macro returned Perl's slightly restricted definition. Now both return Perl's. --- embed.fnc | 1 + embed.h | 1 + ext/XS-APItest/t/handy.t | 4 ++++ handy.h | 9 ++++----- proto.h | 3 +++ utf8.c | 8 ++++++++ 6 files changed, 21 insertions(+), 5 deletions(-) diff --git a/embed.fnc b/embed.fnc index 3b9767b..a1e1f5e 100644 --- a/embed.fnc +++ b/embed.fnc @@ -626,6 +626,7 @@ Amp |UV |to_uni_fold |UV c|NN U8 *p|NN STRLEN *lenp AMp |UV |_to_uni_fold_flags|UV c|NN U8 *p|NN STRLEN *lenp|const U8 flags ApPR |bool |is_uni_alnum_lc|UV c ApPR |bool |is_uni_idfirst_lc|UV c +AMpR |bool |_is_uni_perl_idstart|UV c ApPR |bool |is_uni_alpha_lc|UV c ApPR |bool |is_uni_ascii_lc|UV c ApPR |bool |is_uni_space_lc|UV c diff --git a/embed.h b/embed.h index 473ab33..e19a9df 100644 --- a/embed.h +++ b/embed.h @@ -27,6 +27,7 @@ /* Hide global symbols */ #define Gv_AMupdate(a,b) Perl_Gv_AMupdate(aTHX_ a,b) +#define _is_uni_perl_idstart(a) Perl__is_uni_perl_idstart(aTHX_ a) #define _is_utf8_perl_idstart(a) Perl__is_utf8_perl_idstart(aTHX_ a) #define _to_uni_fold_flags(a,b,c,d) Perl__to_uni_fold_flags(aTHX_ a,b,c,d) #define _to_utf8_fold_flags(a,b,c,d,e) Perl__to_utf8_fold_flags(aTHX_ a,b,c,d,e) diff --git a/ext/XS-APItest/t/handy.t b/ext/XS-APItest/t/handy.t index d6c8df6..4c852c5 100644 --- a/ext/XS-APItest/t/handy.t +++ b/ext/XS-APItest/t/handy.t @@ -57,6 +57,10 @@ foreach my $name (sort keys %properties) { last if $above_latins > 5; } + # This makes sure we are using the Perl definition of idfirst, and not the + # Unicode. There are a few differences. + push @code_points, ord "\N{ESTIMATED SYMBOL}" if $name eq 'idfirst'; + # And finally one non-Unicode code point. push @code_points, 0x110000; # Above Unicode, no prop should match no warnings 'non_unicode'; diff --git a/handy.h b/handy.h index c03853e..5134c9d 100644 --- a/handy.h +++ b/handy.h @@ -927,7 +927,7 @@ EXTCONST U32 PL_charclass[]; #define isWORDCHAR_uni(c) _generic_uni(_CC_WORDCHAR, is_uni_alnum, c) #define isALNUM_uni(c) isWORDCHAR_uni(c) #define isBLANK_uni(c) _generic_uni(_CC_BLANK, is_HORIZWS_cp_high, c) -#define isIDFIRST_uni(c) _generic_uni(_CC_IDFIRST, is_uni_idfirst, c) +#define isIDFIRST_uni(c) _generic_uni(_CC_IDFIRST, _is_uni_perl_idstart, c) #define isALPHA_uni(c) _generic_uni(_CC_ALPHA, is_uni_alpha, c) #define isSPACE_uni(c) _generic_uni(_CC_SPACE, is_XPERLSPACE_cp_high, c) #define isVERTWS_uni(c) _generic_uni(_CC_VERTSPACE, is_VERTWS_cp_high, c) @@ -992,10 +992,9 @@ EXTCONST U32 PL_charclass[]; /* To prevent S_scan_word in toke.c from hanging, we have to make sure that * IDFIRST is an alnum. See * http://rt.perl.org/rt3/Ticket/Display.html?id=74022 for more detail than you - * ever wanted to know about. XXX It is unclear if this should extend to - * isIDFIRST_uni() which it hasn't so far. (In the ASCII range, there isn't a - * difference.) This used to be not the XID version, but we decided to go with - * the more modern Unicode definition */ + * ever wanted to know about. (In the ASCII range, there isn't a difference.) + * This used to be not the XID version, but we decided to go with the more + * modern Unicode definition */ #define isIDFIRST_utf8(p) _generic_utf8(_CC_IDFIRST, \ _is_utf8_perl_idstart, p) diff --git a/proto.h b/proto.h index 560798a..f9d7b9e 100644 --- a/proto.h +++ b/proto.h @@ -32,6 +32,9 @@ PERL_CALLCONV void Perl_Slab_Free(pTHX_ void *op) #define PERL_ARGS_ASSERT_SLAB_FREE \ assert(op) +PERL_CALLCONV bool Perl__is_uni_perl_idstart(pTHX_ UV c) + __attribute__warn_unused_result__; + PERL_CALLCONV bool Perl__is_utf8_perl_idstart(pTHX_ const U8 *p) __attribute__warn_unused_result__ __attribute__nonnull__(pTHX_1); diff --git a/utf8.c b/utf8.c index 075f4c3..70620af 100644 --- a/utf8.c +++ b/utf8.c @@ -1499,6 +1499,14 @@ Perl_is_uni_idfirst(pTHX_ UV c) } bool +Perl__is_uni_perl_idstart(pTHX_ UV c) +{ + U8 tmpbuf[UTF8_MAXBYTES+1]; + uvchr_to_utf8(tmpbuf, c); + return _is_utf8_perl_idstart(tmpbuf); +} + +bool Perl_is_uni_alpha(pTHX_ UV c) { U8 tmpbuf[UTF8_MAXBYTES+1]; -- 2.7.4