From e4e94b48fb015fb6cfcd4fa6fb94ce0523715a9c Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Fri, 6 Jan 2012 14:38:37 -0700 Subject: [PATCH] regcomp.c; Use Latin1 \p{} in optimization This commit causes any Latin1-range characters from Unicode properties to be placed at compile time into the bitmap of the ANYOF node that implements those properties, and to remove the flag that says they should be looked for at run time. This causes the optimizer to generate a better start class, as it knows more fully which characters can be and can't be in the start class, and speeds up runtime checking, as it can just do a bitmap test for these, instead of having to go look at the swash. --- regcomp.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 57 insertions(+), 5 deletions(-) diff --git a/regcomp.c b/regcomp.c index c2cc4c4..5273b22 100644 --- a/regcomp.c +++ b/regcomp.c @@ -10385,10 +10385,6 @@ parseit: Safefree(name); } RExC_parse = e + 1; - - /* The \p could match something in the Latin1 range, hence - * something that isn't utf8 */ - ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8; namedclass = ANYOF_MAX; /* no official name, but it's named */ /* \p means they want Unicode semantics */ @@ -10914,9 +10910,65 @@ parseit: } } + /* Here, <nonbitmap> contains all the code points we can determine at + * compile time that we haven't put into the bitmap. Go through it, and + * for things that belong in the bitmap, put them there, and delete from + * <nonbitmap> */ + if (nonbitmap) { + + /* Above-ASCII code points in /d have to stay in <nonbitmap>, as they + * possibly only should match when the target string is UTF-8 */ + UV max_cp_to_set = (DEPENDS_SEMANTICS) ? 
127 : 255; + + /* This gets set if we actually need to modify things */ + bool change_invlist = FALSE; + + UV start, end; + + /* Start looking through <nonbitmap> */ + invlist_iterinit(nonbitmap); + while (invlist_iternext(nonbitmap, &start, &end)) { + UV high; + int i; + + /* Quit if are above what we should change */ + if (start > max_cp_to_set) { + break; + } + + change_invlist = TRUE; + + /* Set all the bits in the range, up to the max that we are doing */ + high = (end < max_cp_to_set) ? end : max_cp_to_set; + for (i = start; i <= (int) high; i++) { + if (! ANYOF_BITMAP_TEST(ret, i)) { + ANYOF_BITMAP_SET(ret, i); + stored++; + prevvalue = value; + value = i; + } + } + } + + /* Done with loop; set <nonbitmap> to not include any code points that + * are in the bitmap */ + if (change_invlist) { + SV* keep_list = _new_invlist(2); + _append_range_to_invlist(keep_list, max_cp_to_set + 1, UV_MAX); + _invlist_intersection(nonbitmap, keep_list, &nonbitmap); + SvREFCNT_dec(keep_list); + } + + /* If have completely emptied it, remove it completely */ + if (invlist_len(nonbitmap) == 0) { + SvREFCNT_dec(nonbitmap); + nonbitmap = NULL; + } + } /* Here, we have calculated what code points should be in the character - * class. + * class. <nonbitmap> does not overlap the bitmap except possibly in the + * case of DEPENDS rules. * * Now we can see about various optimizations. Fold calculation (which we * did above) needs to take place before inversion. Otherwise /[^k]/i -- 2.7.4