From 3b04b210101efbbbdf1d8095e181c4218cdf59c2 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Mon, 17 Feb 2014 15:39:12 -0700 Subject: [PATCH] Change method of passing some info from regcomp to regexec For the last several releases, the fact that an ANYOF node could match something outside its bitmap has been passed to regexec.c by having its ARG field not be -1 (appropriately cast). A bit was set if the match could occur even if the target string was not UTF-8 encoded. This design was used to save a bit, as previously there was a bit also for it matching UTF-8 strings. That design is no longer tenable, as a future commit will have a third (independent) reason for something to match outside the bitmap. This commit uses the current spare bit flag to indicate if the match can only occur if the target string is UTF-8. --- regcomp.c | 13 ++++++++++--- regcomp.h | 20 ++++++-------------- regexec.c | 21 +++++---------------- 3 files changed, 21 insertions(+), 33 deletions(-) diff --git a/regcomp.c b/regcomp.c index 6382c78..f7bac3d 100644 --- a/regcomp.c +++ b/regcomp.c @@ -12351,6 +12351,9 @@ S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr) if (end == UV_MAX && start <= 256) { ANYOF_FLAGS(node) |= ANYOF_ABOVE_LATIN1_ALL; } + else if (end >= 256) { + ANYOF_FLAGS(node) |= ANYOF_UTF8; + } /* Quit if are above what we should change */ if (start > 255) { @@ -14792,6 +14795,7 @@ parseit: else { cp_list = depends_list; } + ANYOF_FLAGS(ret) |= ANYOF_UTF8; } /* If there is a swash and more than one element, we can't use the swash in @@ -14845,12 +14849,15 @@ S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, PERL_ARGS_ASSERT_SET_ANYOF_ARG; if (! cp_list && ! runtime_defns) { + assert(! (ANYOF_FLAGS(node) & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8))); ARG_SET(node, ANYOF_NONBITMAP_EMPTY); } else { AV * const av = newAV(); SV *rv; + assert(ANYOF_FLAGS(node) & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8)); + av_store(av, 0, (runtime_defns) ? 
SvREFCNT_inc(runtime_defns) : &PL_sv_undef); if (swash) { @@ -15665,8 +15672,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) } } - if ((flags & ANYOF_ABOVE_LATIN1_ALL) - || ANYOF_UTF8_LOCALE_INVLIST(o) || ANYOF_NONBITMAP(o)) + if ((flags & (ANYOF_ABOVE_LATIN1_ALL|ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8)) + || ANYOF_UTF8_LOCALE_INVLIST(o)) { if (do_sep) { Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); @@ -15682,7 +15689,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) /* output information about the unicode matching */ if (flags & ANYOF_ABOVE_LATIN1_ALL) sv_catpvs(sv, "{unicode_all}"); - else if (ANYOF_NONBITMAP(o)) { + else if (FLAGS(o) & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8)) { SV *lv; /* Set if there is something outside the bit map. */ bool byte_output = FALSE; /* If something in the bitmap has been output */ diff --git a/regcomp.h b/regcomp.h index ad688d2..1b00c20 100644 --- a/regcomp.h +++ b/regcomp.h @@ -347,17 +347,7 @@ struct regnode_ssc { * reach this high). */ #define ANYOF_NONBITMAP_EMPTY ((U32) -1) -/* The information used to be stored as as combination of the ANYOF_UTF8 and - * ANYOF_NONBITMAP_NON_UTF8 bits in the flags field, but was moved out of there - * to free up a bit for other uses. This tries to hide the change from - * existing code as much as possible. Now, the data structure that goes in ARG - * is not allocated unless it is needed, and that is what is used to determine - * if there is something outside the bitmap. The code now assumes that if - * that structure exists, that any UTF-8 encoded string should be tried against - * it, but a non-UTF8-encoded string will be tried only if the - * ANYOF_NONBITMAP_NON_UTF8 bit is also set. */ -#define ANYOF_NONBITMAP(node) (ARG(node) != ANYOF_NONBITMAP_EMPTY) -/* Flags for node->flags of ANYOF. These are in short supply, with one +/* Flags for node->flags of ANYOF. These are in short supply, with none * currently available. 
The ABOVE_LATIN1_ALL bit could be freed up * by resorting to creating a swash containing everything above 255. This * introduces a performance penalty. An option that wouldn't slow things down @@ -380,7 +370,9 @@ struct regnode_ssc { * regex compilation. */ #define ANYOF_EMPTY_STRING ANYOF_INVERT -/* spare 0x02 */ +/* Are there things that will match only if the target string is encoded in + * UTF-8? (This is not set if ANYOF_AOVE_LATIN1_ALL is set) */ +#define ANYOF_UTF8 0x02 /* The fold is calculated and stored in the bitmap where possible at compile * time. However under locale, the actual folding varies depending on @@ -411,14 +403,14 @@ struct regnode_ssc { * in utf8. */ #define ANYOF_NON_UTF8_NON_ASCII_ALL 0x80 -#define ANYOF_FLAGS_ALL (0xf5) +#define ANYOF_FLAGS_ALL (0xff) #define ANYOF_LOCALE_FLAGS (ANYOF_LOC_FOLD | ANYOF_POSIXL) /* These are the flags that apply to both regular ANYOF nodes and synthetic * start class nodes during construction of the SSC. During finalization of * the SSC, other of the flags could be added to it */ -#define ANYOF_COMMON_FLAGS (ANYOF_WARN_SUPER) +#define ANYOF_COMMON_FLAGS (ANYOF_WARN_SUPER|ANYOF_UTF8) /* Character classes for node->classflags of ANYOF */ /* Should be synchronized with a table in regprop() */ diff --git a/regexec.c b/regexec.c index c31ae76..28b0bb9 100644 --- a/regexec.c +++ b/regexec.c @@ -7532,7 +7532,7 @@ S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH; - assert(ANYOF_NONBITMAP(node)); + assert(ANYOF_FLAGS(node) & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8)); if (data && data->count) { const U32 n = ARG(node); @@ -7720,25 +7720,14 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const } /* If the bitmap didn't (or couldn't) match, and something outside the - * bitmap could match, try that. 
Locale nodes specify completely the - * behavior of code points in the bit map (otherwise, a utf8 target would - * cause them to be treated as Unicode and not locale), except in - * the very unlikely event when this node is a synthetic start class, which - * could be a combination of locale and non-locale nodes. So allow locale - * to match for the synthetic start class, which will give a false - * positive that will be resolved when the match is done again as not part - * of the synthetic start class */ + * bitmap could match, try that. */ if (!match) { if (c >= 256 && (flags & ANYOF_ABOVE_LATIN1_ALL)) { match = TRUE; /* Everything above 255 matches */ } - else if (ANYOF_NONBITMAP(n) - && ((flags & ANYOF_NONBITMAP_NON_UTF8) - || (utf8_target - && (c >=256 - || (! (flags & ANYOF_LOCALE_FLAGS)) - || is_ANYOF_SYNTHETIC(n))))) - { + else if ((flags & ANYOF_NONBITMAP_NON_UTF8) + || (utf8_target && (flags & ANYOF_UTF8))) + { SV * const sw = core_regclass_swash(prog, n, TRUE, 0); if (sw) { U8 * utf8_p; -- 2.7.4