STATIC SV*
S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
- const regnode_charclass_posixl_fold* const node)
+ const regnode_charclass_posixl* const node)
{
/* Returns a mortal inversion list defining which code points are matched
* by 'node', which is of type ANYOF. Handles complementing the result if
* possibility. */
SV* invlist = sv_2mortal(_new_invlist(0));
+ SV* only_utf8_locale_invlist = NULL;
unsigned int i;
const U32 n = ARG(node);
bool new_node_has_latin1 = FALSE;
* known until runtime -- we have to assume it could be anything */
return _add_range_to_invlist(invlist, 0, UV_MAX);
}
- else {
+ else if (ary[3] && ary[3] != &PL_sv_undef) {
/* Here no compile-time swash, and no run-time only data. Use the
* node's inversion list */
- invlist = sv_2mortal(invlist_clone(ary[2]));
+ invlist = sv_2mortal(invlist_clone(ary[3]));
+ }
+
+ /* Get the code points valid only under UTF-8 locales */
+ if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD)
+ && ary[2] && ary[2] != &PL_sv_undef)
+ {
+ only_utf8_locale_invlist = ary[2];
}
}
_invlist_union(invlist, PL_Latin1, &invlist);
}
- /* Similarly add the UTF-8 locale possible matches */
- if (ANYOF_FLAGS(node) & ANYOF_LOC_FOLD && ANYOF_UTF8_LOCALE_INVLIST(node))
- {
+ /* Similarly add the UTF-8 locale possible matches. These have to be
+ * deferred until after the non-UTF-8 locale ones are taken care of just
+ * above, or it leads to wrong results under ANYOF_INVERT */
+ if (only_utf8_locale_invlist) {
_invlist_union_maybe_complement_2nd(invlist,
- ANYOF_UTF8_LOCALE_INVLIST(node),
+ only_utf8_locale_invlist,
ANYOF_FLAGS(node) & ANYOF_INVERT,
&invlist);
}
}
else {
anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state,
- (regnode_charclass_posixl_fold*) and_with);
+ (regnode_charclass_posixl*) and_with);
anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
}
* standard, in particular almost everything by Microsoft.
* The loop below just changes e.g., \w into \W and vice versa */
- regnode_charclass_posixl_fold temp;
+ regnode_charclass_posixl temp;
int add = 1; /* To calculate the index of the complement */
ANYOF_POSIXL_ZERO(&temp);
}
else {
ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state,
- (regnode_charclass_posixl_fold*) or_with);
+ (regnode_charclass_posixl*) or_with);
ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
}
populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
- set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL, FALSE);
+ set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
+ NULL, NULL, NULL, FALSE);
if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
ANYOF_FLAGS(ssc) |= ANYOF_POSIXL;
* that fold to/from them under /i */
SV* cp_foldable_list = NULL;
+ /* Like cp_list, but code points on this list are valid only when the
+ * runtime locale is UTF-8 */
+ SV* only_utf8_locale_list = NULL;
+
#ifdef EBCDIC
/* In a range, counts how many 0-2 of the ends of it came from literals,
* not escapes. Thus we can tell if 'A' was input vs \x{C1} */
* against. This isn't needed for \p{} and pseudo-classes, as they are
* not affected by locale, and hence are dealt with separately */
if (LOC) {
- if (FOLD && ! need_class) {
- need_class = 1;
- if (SIZE_ONLY) {
- RExC_size += ANYOF_POSIXL_FOLD_SKIP - ANYOF_SKIP;
- }
- else {
- RExC_emit += ANYOF_POSIXL_FOLD_SKIP - ANYOF_SKIP;
- }
-
- /* We need to initialize this here because this node type has
- * this field, and will skip getting initialized when we get to
- * a posix class since are doing it here */
- ANYOF_POSIXL_ZERO(ret);
- }
if (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_POSIXL_MAX) {
if (! need_class) {
need_class = 1;
* runtime only when the locale indicates Unicode rules. For
* non-locale, we just use to the general list */
if (LOC) {
- use_list = &ANYOF_UTF8_LOCALE_INVLIST(ret);
- *use_list = NULL;
+ use_list = &only_utf8_locale_list;
}
else {
use_list = &cp_list;
* fetching). We know to set the flag if we have a non-NULL list for UTF-8
* locales, or the class matches at least one 0-255 range code point */
if (LOC && FOLD) {
- if (ANYOF_UTF8_LOCALE_INVLIST(ret)) {
+ if (only_utf8_locale_list) {
ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
}
else if (cp_list) { /* Look to see if there a 0-255 code point is in
set_ANYOF_arg(pRExC_state, ret, cp_list,
(HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
? listsv : NULL,
+ only_utf8_locale_list,
swash, has_user_defined_property);
*flagp |= HASWIDTH|SIMPLE;
regnode* const node,
SV* const cp_list,
SV* const runtime_defns,
+ SV* const only_utf8_locale_list,
SV* const swash,
const bool has_user_defined_property)
{
* av[1] if &PL_sv_undef, is a placeholder to later contain the swash
* computed from av[0]. But if no further computation need be done,
* the swash is stored here now (and av[0] is &PL_sv_undef).
- * av[2] stores the cp_list inversion list for use in addition or instead
+ * av[2] stores the inversion list of code points that match only if the
+ * current locale is UTF-8
+ * av[3] stores the cp_list inversion list for use in addition or instead
* of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
* (Otherwise everything needed is already in av[0] and av[1])
- * av[3] is set if any component of the class is from a user-defined
- * property; used only if av[2] exists */
+ * av[4] is set if any component of the class is from a user-defined
+ * property; used only if av[3] exists */
UV n;
PERL_ARGS_ASSERT_SET_ANYOF_ARG;
- if (! cp_list && ! runtime_defns) {
- assert(! (ANYOF_FLAGS(node) & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8)));
+ if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
+ assert(! (ANYOF_FLAGS(node)
+ & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8)));
ARG_SET(node, ANYOF_NONBITMAP_EMPTY);
}
else {
AV * const av = newAV();
SV *rv;
- assert(ANYOF_FLAGS(node) & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8));
+ assert(ANYOF_FLAGS(node)
+ & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD));
av_store(av, 0, (runtime_defns)
? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
else {
av_store(av, 1, &PL_sv_undef);
if (cp_list) {
- av_store(av, 2, cp_list);
- av_store(av, 3, newSVuv(has_user_defined_property));
+ av_store(av, 3, cp_list);
+ av_store(av, 4, newSVuv(has_user_defined_property));
}
}
+ if (only_utf8_locale_list) {
+ av_store(av, 2, only_utf8_locale_list);
+ }
+ else {
+ av_store(av, 2, &PL_sv_undef);
+ }
+
rv = newRV_noinc(MUTABLE_SV(av));
n = add_data(pRExC_state, STR_WITH_LEN("s"));
RExC_rxi->data->data[n] = (void*)rv;
}
}
- if ((flags & (ANYOF_ABOVE_LATIN1_ALL|ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8))
- || ANYOF_UTF8_LOCALE_INVLIST(o))
+ if ((flags & (ANYOF_ABOVE_LATIN1_ALL
+ |ANYOF_UTF8
+ |ANYOF_NONBITMAP_NON_UTF8
+ |ANYOF_LOC_FOLD)))
{
if (do_sep) {
Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
/* output information about the unicode matching */
if (flags & ANYOF_ABOVE_LATIN1_ALL)
sv_catpvs(sv, "{unicode_all}");
- else if (FLAGS(o) & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8)) {
+ else if (ARG(o) != ANYOF_NONBITMAP_EMPTY) {
SV *lv; /* Set if there is something outside the bit map. */
bool byte_output = FALSE; /* If something in the bitmap has
been output */
+ SV *only_utf8_locale;
/* Get the stuff that wasn't in the bitmap */
- (void) regclass_swash(prog, o, FALSE, &lv, NULL);
+ (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
+ &lv, &only_utf8_locale);
if (lv && lv != &PL_sv_undef) {
char *s = savesvpv(lv);
char * const origs = s;
Safefree(origs);
SvREFCNT_dec_NN(lv);
}
- }
- /* Output any UTF-8 locale code points */
- if (flags & ANYOF_LOC_FOLD && ANYOF_UTF8_LOCALE_INVLIST(o)) {
+ if ((flags & ANYOF_LOC_FOLD)
+ && only_utf8_locale
+ && only_utf8_locale != &PL_sv_undef)
+ {
UV start, end;
int max_entries = 256;
sv_catpvs(sv, "{utf8 locale}");
- invlist_iterinit(ANYOF_UTF8_LOCALE_INVLIST(o));
- while (invlist_iternext(ANYOF_UTF8_LOCALE_INVLIST(o),
+ invlist_iterinit(only_utf8_locale);
+ while (invlist_iternext(only_utf8_locale,
&start, &end)) {
put_range(sv, start, end);
max_entries --;
break;
}
}
- invlist_iterfinish(ANYOF_UTF8_LOCALE_INVLIST(o));
+ invlist_iterfinish(only_utf8_locale);
+ }
}
}
}
else if (PL_regkind[(U8)op] == ANYOF) {
/* arglen 1 + class block */
- node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD)
- ? ANYOF_POSIXL_FOLD_SKIP
- : (ANYOF_FLAGS(node) & ANYOF_POSIXL)
- ? ANYOF_POSIXL_SKIP
- : ANYOF_SKIP);
+ node += 1 + ((ANYOF_FLAGS(node) & ANYOF_POSIXL)
+ ? ANYOF_POSIXL_SKIP
+ : ANYOF_SKIP);
node = NEXTOPER(node);
}
else if (PL_regkind[(U8)op] == EXACT) {
#define ANYOF_BITMAP_SIZE (256 / 8) /* 8 bits/Byte */
+/* Note that these form structs which are supersets of the next smaller one, by
+ * appending fields. Alignment problems can occur if one of those optional
+ * fields requires stricter alignment than the base struct. And formal
+ * parameters that can really be two or more of the structs should be
+ * declared as the smallest one they could be. See commit message for
+ * 7dcac5f6a5195002b55c935ee1d67f67e1df280b. Regnode allocation is done
+ * without regard to alignment, and changing it to do so would also require changing
+ * the code that inserts and deletes regnodes. The basic single-argument
+ * regnode has a U32, which is what reganode() allocates as a unit. Therefore
+ * no field can require stricter alignment than U32. */
+
/* also used by trie */
struct regnode_charclass {
U8 flags;
U32 classflags; /* and run-time */
};
-/* like above, but also has folds that are used only if the runtime locale is
- * UTF-8. */
-struct regnode_charclass_posixl_fold {
- U8 flags; /* ANYOF_POSIXL bit must go here */
- U8 type;
- U16 next_off;
- U32 arg1;
- char bitmap[ANYOF_BITMAP_SIZE]; /* both compile-time */
- U32 classflags; /* and run-time */
- SV* utf8_locale_list; /* list of code points matched by folds
- in a UTF-8 locale */
-};
-
/* A synthetic start class; is a regnode_charclass_posixl_fold, plus an extra
* SV*, used only during its construction and which is not used by regexec.c.
* Note that the 'next_off' field is unused, as the SSC stands alone, so there
- * is never a next node. */
+ * is never a next node. Also, there is no alignment issue, because these are
+ * declared or allocated as a complete unit so the compiler takes care of
+ * alignment. This is unlike the other regnodes which are allocated in terms
+ * of multiples of a single-argument regnode. Because there is no alignment
+ * issue, these can have a pointer field */
struct regnode_ssc {
U8 flags; /* ANYOF_POSIXL bit must go here */
U8 type;
U32 arg1;
char bitmap[ANYOF_BITMAP_SIZE]; /* both compile-time */
U32 classflags; /* and run-time */
- SV* utf8_locale_list; /* list of code points matched by folds
- in a UTF-8 locale */
SV* invlist; /* list of code points matched */
};
#define ANYOF_SIZE (sizeof(struct regnode_charclass))
#define ANYOF_POSIXL_SIZE (sizeof(regnode_charclass_posixl))
#define ANYOF_CLASS_SIZE ANYOF_POSIXL_SIZE
-#define ANYOF_POSIXL_FOLD_SIZE (sizeof(regnode_charclass_posixl_fold))
#define ANYOF_FLAGS(p) ((p)->flags)
#define ANYOF_SKIP ((ANYOF_SIZE - 1)/sizeof(regnode))
#define ANYOF_POSIXL_SKIP ((ANYOF_POSIXL_SIZE - 1)/sizeof(regnode))
-#define ANYOF_POSIXL_FOLD_SKIP ((ANYOF_POSIXL_FOLD_SIZE - 1)/sizeof(regnode))
#define ANYOF_CLASS_SKIP ANYOF_POSIXL_SKIP
-#define ANYOF_UTF8_LOCALE_INVLIST(node) (((regnode_charclass_posixl_fold*) (node))->utf8_locale_list)
-
/*
* Utility definitions.
*/
PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
- assert(ANYOF_FLAGS(node) & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8));
+ assert(ANYOF_FLAGS(node)
+ & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD));
if (data && data->count) {
const U32 n = ARG(node);
si = *ary; /* ary[0] = the string to initialize the swash with */
- /* Elements 2 and 3 are either both present or both absent. [2] is
- * any inversion list generated at compile time; [3] indicates if
+ /* Elements 3 and 4 are either both present or both absent. [3] is
+ * any inversion list generated at compile time; [4] indicates if
* that inversion list has any user-defined properties in it. */
- if (av_len(av) >= 2) {
- invlist = ary[2];
- if (SvUV(ary[3])) {
+ if (av_tindex(av) >= 2) {
+ /* Element [2] is the list of code points that match only when the
+ * runtime locale is UTF-8. The output pointer is tested for NULL, so
+ * it must never be written through unless it is non-NULL; otherwise a
+ * NULL pointer would be dereferenced when storing the "no list"
+ * result. */
+ if (only_utf8_locale_ptr) {
+ if (ary[2] && ary[2] != &PL_sv_undef) {
+ *only_utf8_locale_ptr = ary[2];
+ }
+ else {
+ *only_utf8_locale_ptr = NULL;
+ }
+ }
+
+ if (av_len(av) >= 3) {
+ invlist = ary[3];
+ if (SvUV(ary[4])) {
swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
}
}
else {
invlist = NULL;
}
+ }
/* Element [1] is reserved for the set-up swash. If already there,
* return it; if not, create it and store it there */
}
}
- /* For /li matching and the current locale is a UTF-8 one, look at the
- * special list, valid for just these circumstances. */
- if (! match
- && (flags & ANYOF_LOC_FOLD)
- && IN_UTF8_CTYPE_LOCALE
- && ANYOF_UTF8_LOCALE_INVLIST(n))
- {
- match = _invlist_contains_cp(ANYOF_UTF8_LOCALE_INVLIST(n), c);
- }
/* If the bitmap didn't (or couldn't) match, and something outside the
* bitmap could match, try that. */
match = TRUE; /* Everything above 255 matches */
}
else if ((flags & ANYOF_NONBITMAP_NON_UTF8)
- || (utf8_target && (flags & ANYOF_UTF8)))
+ || (utf8_target && (flags & ANYOF_UTF8))
+ || ((flags & ANYOF_LOC_FOLD)
+ && IN_UTF8_CTYPE_LOCALE
+ && ARG(n) != ANYOF_NONBITMAP_EMPTY))
{
- SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0, NULL);
+ SV* only_utf8_locale = NULL;
+ SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0,
+ &only_utf8_locale);
if (sw) {
U8 * utf8_p;
if (utf8_target) {
/* If we allocated a string above, free it */
if (! utf8_target) Safefree(utf8_p);
}
+ if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) {
+ match = _invlist_contains_cp(only_utf8_locale, c);
+ }
}
if (UNICODE_IS_SUPER(c)