From 899d20b99829f8ecdc14e1351b533bc62a354dea Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Thu, 27 Dec 2012 14:35:46 -0700 Subject: [PATCH] regcomp.c: Free up ANYOF flag bit This frees up a flag bit for ANYOF regnodes. The freed bit is currently not needed for other uses; I decided to make the change now, while how to do it was fresh in my mind. There are fewer shifts and masks as a result, as well. This commit moves the information this bit contains to the otherwise unused 'next_off' field in the synthetic start class. This paradigm could be used to pass information to the regex matching code for just the synthetic start class, but the current bit is used just during compilation. --- regcomp.c | 44 ++++++++++++++++++++++++++++---------------- regcomp.h | 29 +++++++++++------------------ 2 files changed, 39 insertions(+), 34 deletions(-) diff --git a/regcomp.c b/regcomp.c index b24adae..9008ae7 100644 --- a/regcomp.c +++ b/regcomp.c @@ -749,6 +749,17 @@ S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *min DEBUG_STUDYDATA("commit: ",data,0); } +/* These macros set, clear and test whether the synthetic start class ('ssc', + * given by the parameter) matches an empty string (EOS). This uses the + * 'next_off' field in the node, to save a bit in the flags field. The ssc + * stands alone, so there is never a next_off, so this field is otherwise + * unused. The EOS information is used only for compilation, but theoretically + * it could be passed on to the execution code. This could be used to store + * more than one bit of information, but only this one is currently used. 
*/ +#define SET_SSC_EOS(node) STMT_START { (node)->next_off = TRUE; } STMT_END +#define CLEAR_SSC_EOS(node) STMT_START { (node)->next_off = FALSE; } STMT_END +#define TEST_SSC_EOS(node) cBOOL((node)->next_off) + /* Can match anything (initialization) */ STATIC void S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) @@ -756,7 +767,8 @@ S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *c PERL_ARGS_ASSERT_CL_ANYTHING; ANYOF_BITMAP_SETALL(cl); - cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL; + cl->flags = ANYOF_UNICODE_ALL; + SET_SSC_EOS(cl); /* If any portion of the regex is to operate under locale rules, * initialization includes it. The reason this isn't done for all regexes @@ -3140,7 +3152,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, StructCopy(&accum, data->start_class, struct regnode_charclass_class); flags |= SCF_DO_STCLASS_OR; - data->start_class->flags |= ANYOF_EOS; + SET_SSC_EOS(data->start_class); } } @@ -3576,7 +3588,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } } } - data->start_class->flags &= ~ANYOF_EOS; + CLEAR_SSC_EOS(data->start_class); if (uc < 0x100) data->start_class->flags &= ~ANYOF_UNICODE_ALL; } @@ -3586,7 +3598,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, ANYOF_BITMAP_SET(data->start_class, uc); else data->start_class->flags |= ANYOF_UNICODE_ALL; - data->start_class->flags &= ~ANYOF_EOS; + CLEAR_SSC_EOS(data->start_class); cl_and(data->start_class, and_withp); } flags &= ~SCF_DO_STCLASS; @@ -3635,7 +3647,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, ANYOF_BITMAP_ZERO(data->start_class); if (compat) { ANYOF_BITMAP_SET(data->start_class, uc); - data->start_class->flags &= ~ANYOF_EOS; + CLEAR_SSC_EOS(data->start_class); if (OP(scan) == EXACTFL) { /* XXX This set is probably no longer necessary, and * probably wrong as LOCALE now is on in the initial @@ -3702,7 +3714,7 @@ S_study_chunk(pTHX_ 
RExC_state_t *pRExC_state, regnode **scanp, } } } - data->start_class->flags &= ~ANYOF_EOS; + CLEAR_SSC_EOS(data->start_class); } cl_and(data->start_class, and_withp); } @@ -3819,7 +3831,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, StructCopy(&this_class, data->start_class, struct regnode_charclass_class); flags |= SCF_DO_STCLASS_OR; - data->start_class->flags |= ANYOF_EOS; + SET_SSC_EOS(data->start_class); } } else { /* Non-zero len */ if (flags & SCF_DO_STCLASS_OR) { @@ -4085,7 +4097,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, else if (OP(scan) == LNBREAK) { if (flags & SCF_DO_STCLASS) { int value = 0; - data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */ + CLEAR_SSC_EOS(data->start_class); /* No match on empty */ if (flags & SCF_DO_STCLASS_AND) { for (value = 0; value < 256; value++) if (!is_VERTWS_cp(value)) @@ -4119,7 +4131,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, min++; if (flags & SCF_DO_STCLASS) { int loop_max = 256; - data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */ + CLEAR_SSC_EOS(data->start_class); /* No match on empty */ /* Some of the logic below assumes that switching locale on will only add false positives. 
*/ @@ -4323,11 +4335,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, cl_init(pRExC_state, data->start_class); } else { /* AND before and after: combine and continue */ - const int was = (data->start_class->flags & ANYOF_EOS); + const int was = TEST_SSC_EOS(data->start_class); cl_and(data->start_class, &intrnl); if (was) - data->start_class->flags |= ANYOF_EOS; + SET_SSC_EOS(data->start_class); } } } @@ -4395,11 +4407,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, *minnextp += min; if (f & SCF_DO_STCLASS_AND) { - const int was = (data->start_class->flags & ANYOF_EOS); + const int was = TEST_SSC_EOS(data->start_class); cl_and(data->start_class, &intrnl); if (was) - data->start_class->flags |= ANYOF_EOS; + SET_SSC_EOS(data->start_class); } if (data) { if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) @@ -4602,7 +4614,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, StructCopy(&accum, data->start_class, struct regnode_charclass_class); flags |= SCF_DO_STCLASS_OR; - data->start_class->flags |= ANYOF_EOS; + SET_SSC_EOS(data->start_class); } } scan= tail; @@ -6148,7 +6160,7 @@ reStudy: if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset) && stclass_flag - && !(data.start_class->flags & ANYOF_EOS) + && ! TEST_SSC_EOS(data.start_class) && !cl_is_anything(data.start_class)) { const U32 n = add_data(pRExC_state, 1, "f"); @@ -6220,7 +6232,7 @@ reStudy: r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8 = r->float_substr = r->float_utf8 = NULL; - if (!(data.start_class->flags & ANYOF_EOS) + if (! TEST_SSC_EOS(data.start_class) && !cl_is_anything(data.start_class)) { const U32 n = add_data(pRExC_state, 1, "f"); diff --git a/regcomp.h b/regcomp.h index 1137d67..dca3bfe 100644 --- a/regcomp.h +++ b/regcomp.h @@ -306,20 +306,16 @@ struct regnode_charclass_class { * ANYOF_NONBITMAP_NON_UTF8 bit is also set. 
*/ #define ANYOF_NONBITMAP(node) (ARG(node) != ANYOF_NONBITMAP_EMPTY) -/* Flags for node->flags of ANYOF. These are in short supply, so some games - * are done to share them, as described below. The ANYOF_LOCALE and +/* Flags for node->flags of ANYOF. These are in short supply, but there is one + * currently available. If more than this are needed, the ANYOF_LOCALE and * ANYOF_CLASS bits could be shared, making a space penalty for all locale nodes. - * An option would be to push them into new nodes. E.g. there could be an - * ANYOF_LOCALE node that would be in place of the flag of the same name. But - * there are better options. The UNICODE_ALL bit could be freed up by - * resorting to creating a swash containing everything above 255. This - * introduces a performance penalty. Better would be to split it off into a - * separate node, which actually would improve performance by allowing adding a - * case statement to regexec.c use the bit map for code points under 256, and - * to match everything above. If flags need to be added that are applicable to - * the synthetic start class only, with some work, they could be put in the - * next-node field, or in an unused bit of the classflags field. This could be - * done with the current EOS flag, freeing up that bit */ + * Also, the UNICODE_ALL bit could be freed up by resorting to creating a swash + * containing everything above 255. This introduces a performance penalty. + * Better would be to split it off into a separate node, which actually would + * improve performance a bit by allowing regexec.c to test for a UTF-8 + * character being above 255 without having to call a function nor calculate + * its code point value. 
However, this solution might need to have a second + * node type, ANYOF_SYNTHETIC_ABOVE_LATIN1_ALL */ #define ANYOF_LOCALE 0x01 /* /l modifier */ @@ -339,9 +335,7 @@ struct regnode_charclass_class { #define ANYOF_CLASS 0x08 #define ANYOF_LARGE ANYOF_CLASS /* Same; name retained for back compat */ -/* EOS, meaning that it can match an empty string too, is used for the - * synthetic start class only. */ -#define ANYOF_EOS 0x10 +/* Unused: 0x10. When using, be sure to change ANYOF_FLAGS_ALL below */ /* Can match something outside the bitmap that isn't in utf8 */ #define ANYOF_NONBITMAP_NON_UTF8 0x20 @@ -353,7 +347,7 @@ struct regnode_charclass_class { * in utf8. */ #define ANYOF_NON_UTF8_LATIN1_ALL 0x80 -#define ANYOF_FLAGS_ALL 0xff +#define ANYOF_FLAGS_ALL (0xff & ~0x10) /* These are the flags that ANYOF_INVERT being set or not doesn't affect * whether they are operative or not. e.g., the node still has LOCALE @@ -362,7 +356,6 @@ struct regnode_charclass_class { #define INVERSION_UNAFFECTED_FLAGS (ANYOF_LOCALE \ |ANYOF_LOC_FOLD \ |ANYOF_CLASS \ - |ANYOF_EOS \ |ANYOF_NONBITMAP_NON_UTF8) /* Character classes for node->classflags of ANYOF */ -- 2.7.4