if (end == UV_MAX && start <= 256) {
ANYOF_FLAGS(node) |= ANYOF_ABOVE_LATIN1_ALL;
}
+ else if (end >= 256) {
+ ANYOF_FLAGS(node) |= ANYOF_UTF8;
+ }
/* Quit if we are above what we should change */
if (start > 255) {
else {
cp_list = depends_list;
}
+ ANYOF_FLAGS(ret) |= ANYOF_UTF8;
}
/* If there is a swash and more than one element, we can't use the swash in
PERL_ARGS_ASSERT_SET_ANYOF_ARG;
if (! cp_list && ! runtime_defns) {
+ assert(! (ANYOF_FLAGS(node) & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8)));
ARG_SET(node, ANYOF_NONBITMAP_EMPTY);
}
else {
AV * const av = newAV();
SV *rv;
+ assert(ANYOF_FLAGS(node) & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8));
+
av_store(av, 0, (runtime_defns)
? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
if (swash) {
}
}
- if ((flags & ANYOF_ABOVE_LATIN1_ALL)
- || ANYOF_UTF8_LOCALE_INVLIST(o) || ANYOF_NONBITMAP(o))
+ if ((flags & (ANYOF_ABOVE_LATIN1_ALL|ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8))
+ || ANYOF_UTF8_LOCALE_INVLIST(o))
{
if (do_sep) {
Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
/* output information about the unicode matching */
if (flags & ANYOF_ABOVE_LATIN1_ALL)
sv_catpvs(sv, "{unicode_all}");
- else if (ANYOF_NONBITMAP(o)) {
+ else if (FLAGS(o) & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8)) {
SV *lv; /* Set if there is something outside the bit map. */
bool byte_output = FALSE; /* If something in the bitmap has
been output */
* reach this high). */
#define ANYOF_NONBITMAP_EMPTY ((U32) -1)
-/* The information used to be stored as as combination of the ANYOF_UTF8 and
- * ANYOF_NONBITMAP_NON_UTF8 bits in the flags field, but was moved out of there
- * to free up a bit for other uses. This tries to hide the change from
- * existing code as much as possible. Now, the data structure that goes in ARG
- * is not allocated unless it is needed, and that is what is used to determine
- * if there is something outside the bitmap. The code now assumes that if
- * that structure exists, that any UTF-8 encoded string should be tried against
- * it, but a non-UTF8-encoded string will be tried only if the
- * ANYOF_NONBITMAP_NON_UTF8 bit is also set. */
-#define ANYOF_NONBITMAP(node) (ARG(node) != ANYOF_NONBITMAP_EMPTY)
-/* Flags for node->flags of ANYOF. These are in short supply, with one
+/* Flags for node->flags of ANYOF. These are in short supply, with none
* currently available. The ABOVE_LATIN1_ALL bit could be freed up
* by resorting to creating a swash containing everything above 255. This
* introduces a performance penalty. An option that wouldn't slow things down
* regex compilation. */
#define ANYOF_EMPTY_STRING ANYOF_INVERT
-/* spare 0x02 */
+/* Are there things that will match only if the target string is encoded in
+ * UTF-8? (This is not set if ANYOF_ABOVE_LATIN1_ALL is set) */
+#define ANYOF_UTF8 0x02
/* The fold is calculated and stored in the bitmap where possible at compile
* time. However under locale, the actual folding varies depending on
* in utf8. */
#define ANYOF_NON_UTF8_NON_ASCII_ALL 0x80
-#define ANYOF_FLAGS_ALL (0xf5)
+#define ANYOF_FLAGS_ALL (0xff)
#define ANYOF_LOCALE_FLAGS (ANYOF_LOC_FOLD | ANYOF_POSIXL)
/* These are the flags that apply to both regular ANYOF nodes and synthetic
* start class nodes during construction of the SSC. During finalization of
* the SSC, other of the flags could be added to it */
-#define ANYOF_COMMON_FLAGS (ANYOF_WARN_SUPER)
+#define ANYOF_COMMON_FLAGS (ANYOF_WARN_SUPER|ANYOF_UTF8)
/* Character classes for node->classflags of ANYOF */
/* Should be synchronized with a table in regprop() */
PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
- assert(ANYOF_NONBITMAP(node));
+ assert(ANYOF_FLAGS(node) & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8));
if (data && data->count) {
const U32 n = ARG(node);
}
/* If the bitmap didn't (or couldn't) match, and something outside the
- * bitmap could match, try that. Locale nodes specify completely the
- * behavior of code points in the bit map (otherwise, a utf8 target would
- * cause them to be treated as Unicode and not locale), except in
- * the very unlikely event when this node is a synthetic start class, which
- * could be a combination of locale and non-locale nodes. So allow locale
- * to match for the synthetic start class, which will give a false
- * positive that will be resolved when the match is done again as not part
- * of the synthetic start class */
+ * bitmap could match, try that. */
if (!match) {
if (c >= 256 && (flags & ANYOF_ABOVE_LATIN1_ALL)) {
match = TRUE; /* Everything above 255 matches */
}
- else if (ANYOF_NONBITMAP(n)
- && ((flags & ANYOF_NONBITMAP_NON_UTF8)
- || (utf8_target
- && (c >=256
- || (! (flags & ANYOF_LOCALE_FLAGS))
- || is_ANYOF_SYNTHETIC(n)))))
- {
+ else if ((flags & ANYOF_NONBITMAP_NON_UTF8)
+ || (utf8_target && (flags & ANYOF_UTF8)))
+ {
SV * const sw = core_regclass_swash(prog, n, TRUE, 0);
if (sw) {
U8 * utf8_p;