ANYOF_CLASS_ZERO(cl);
ANYOF_BITMAP_SETALL(cl);
- cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
+ cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL|ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
if (LOC)
cl->flags |= ANYOF_LOCALE;
- cl->flags |= ANYOF_LOC_NONBITMAP_FOLD;
}
/* Can match anything (initialization) */
if (!(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD))
cl->flags &= ~ANYOF_LOC_NONBITMAP_FOLD;
+ if (!(and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL))
+ cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_NONBITMAP &&
!(and_with->flags & ANYOF_INVERT)) {
}
if (or_with->flags & ANYOF_EOS)
cl->flags |= ANYOF_EOS;
+ if (or_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) /* OR: inherit the flag only when or_with has it set */
+ cl->flags |= ANYOF_NON_UTF8_LATIN1_ALL;
if (or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
cl->flags |= ANYOF_LOC_NONBITMAP_FOLD;
if (! TEST_7) stored += \
S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
} \
- for (value = 128; value < 256; value++) { \
- S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
- } \
+ /* For a non-utf8 target string with DEPENDS semantics, all above ASCII \
+ * Latin1 code points match the complement of any of the classes. But \
+ * in utf8, they have their Unicode semantics, so can't just set them \
+ * in the bitmap, or else regexec.c will think they matched when they \
+ * shouldn't. */ \
+ ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL|ANYOF_UTF8; \
} \
yesno = '!'; \
what = WORD; \
EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
+ if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
+ sv_catpvs(sv, "{non-utf8-latin1-all}");
+ }
+
/* output information about the unicode matching */
if (flags & ANYOF_UNICODE_ALL)
sv_catpvs(sv, "{unicode_all}");
/* Matches every code point 0x100 and above*/
#define ANYOF_UNICODE_ALL 0x40
+/* Match all Latin1 characters that aren't ASCII when the target string is not
+ * in utf8. */
+#define ANYOF_NON_UTF8_LATIN1_ALL 0x80
+
#define ANYOF_FLAGS_ALL 0xff
/* Character classes for node->classflags of ANYOF */
([[:upper:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 AB
([[:xdigit:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01
([[:^alpha:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 01
-([[:^alnum:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 __-- ${nulnul}${ffff}
+([[:^alnum:]]+) ABcd01Xy__-- ${nulnul}${ffff} yT $1 __-- ${nulnul}${ffff}
([[:^ascii:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ${ffff}
([[:^cntrl:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy__--
([[:^digit:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd
([[:^lower:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 AB
-([[:^print:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ${nulnul}${ffff}
+([[:^print:]]+) ABcd01Xy__-- ${nulnul}${ffff} yT $1 ${nulnul}${ffff}
([[:^punct:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy
([[:^space:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy__--
-([[:^word:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 -- ${nulnul}${ffff}
+([[:^word:]]+) ABcd01Xy__-- ${nulnul}${ffff} yT $1 -- ${nulnul}${ffff}
([[:^upper:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 cd01
([[:^xdigit:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 Xy__-- ${nulnul}${ffff}
[[:foo:]] - c - POSIX class [:foo:] unknown
/\N{U+41}\x{c1}/i a\x{e1} y $& a\x{e1}
/[\N{U+41}\x{c1}]/i \x{e1} y $& \x{e1}
-[\s][\S] \x{a0}\x{a0} nT - - # Unicode complements should not match same character
+[\s][\S] \x{a0}\x{a0} n - - # Unicode complements should not match same character
# was generating malformed utf8
'[\x{100}\xff]'i \x{ff} y $& \x{ff}