--- /dev/null
+#!perl
+
+# test CALLREGEXEC()
+# (currently it just checks that it handles non-\0 terminated strings;
+# full tests haven't been added yet)
+
+use warnings;
+use strict;
+
+use XS::APItest;
+*callregexec = *XS::APItest::callregexec;
+
+use Test::More tests => 50;
+
+# Test that the regex engine can handle strings without terminating \0
+# XXX This is by no means comprehensive; it doesn't test all ops, nor all
+# code paths within those ops (especially not utf8).
+
+
+# this sub takes a string that has an extraneous char at the end.
+# First see if the string (less the last char) matches the regex;
+# then see if that string (including the last char) matches when
+# calling callregexec(), but with the length arg set to 1 char less than
+# the length of the string.
+# In theory the result should be the same for both matches, since
+# they should both not 'see' the final char.
+
+sub try {
+    # Arguments:
+    #   $str  - subject string carrying one surplus trailing character
+    #   $re   - compiled pattern (qr//) to match with
+    #   $exp  - true if a match is expected once the surplus char is hidden
+    #   $desc - label prepended to both test descriptions
+    # Emits exactly two tests per call.
+    my ($str, $re, $exp, $desc) = @_;
+
+    my $expected = !!$exp;
+
+    # Reference result: an ordinary match against a copy of the string
+    # that really has had its final character removed.
+    my $str1    = substr($str, 0, -1);
+    my $matched = !!($str1 =~ $re);
+    ok $expected == $matched, "$desc str =~ qr";
+
+    # Length of the shortened copy, measured in bytes rather than
+    # characters.
+    my $bytes;
+    {
+        use bytes;
+        $bytes = length $str1;
+    }
+
+    # Now hand over the untrimmed string, with a byte count that stops
+    # just short of its final character; the outcome should agree with
+    # the reference match.
+    my $exec_matched = !!callregexec($re, 0, $bytes, 0, $str, 0);
+    ok $expected == $exec_matched,
+        "$desc callregexec";
+}
+
+
+# One row per regex op under test:
+#   [ string with surplus final char, pattern, expect-match flag, op name ]
+# Rows are run in order, and each row produces two tests via try().
+for my $case (
+    [ "\nx",             qr/\n^/m,          0, 'MBOL'       ],
+    [ "ax",              qr/a$/m,           1, 'MEOL'       ],
+    [ "ax",              qr/a$/s,           1, 'SEOL'       ],
+    [ "abx",             qr/^(ab|X)./s,     0, 'SANY'       ],
+    [ "abx",             qr/^(ab|X)\C/,     0, 'CANY'       ],
+    [ "abx",             qr/^(ab|X)./,      0, 'REG_ANY'    ],
+    [ "abx",             qr/^ab(c|d|e|x)/,  0, 'TRIE/TRIEC' ],
+    [ "abx",             qr/^abx/,          0, 'EXACT'      ],
+    [ "abx",             qr/^ABX/i,         0, 'EXACTF'     ],
+    [ "abx",             qr/^ab\b/,         1, 'BOUND'      ],
+    [ "ab-",             qr/^ab\B/,         0, 'NBOUND'     ],
+    [ "aas",             qr/a[st]/,         0, 'ANYOF'      ],
+    [ "aas",             qr/a[s\xDF]/i,     0, 'ANYOFV'     ],
+    [ "ab1",             qr/ab\d/,          0, 'DIGIT'      ],
+    [ "ab\n",            qr/ab[[:ascii:]]/, 0, 'POSIX'      ],
+    [ "aP\x{307}",       qr/^a\X/,          1, 'CLUMP 1'    ],
+    [ "aP\x{307}x",      qr/^a\X/,          1, 'CLUMP 2'    ],
+    [ "\x{100}\r\n",     qr/^\x{100}\X/,    1, 'CLUMP 3'    ],
+    [ "abb",             qr/^a(b)\1/,       0, 'REF'        ],
+    [ "ab\n",            qr/^.+\R/,         0, 'LNBREAK'    ],
+    [ "ab\n",            qr/^.+\v/,         0, 'VERTWS'     ],
+    [ "abx",             qr/^.+\V/,         1, 'NVERTWS'    ],
+    [ "ab\t",            qr/^.+\h/,         0, 'HORIZWS'    ],
+    [ "abx",             qr/^.+\H/,         1, 'NHORIZWS'   ],
+    [ "abx",             qr/a.*x/,          0, 'CURLY'      ],
+) {
+    try(@{$case});
+}
#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
+
+#define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
+#define NEXTCHR_IS_EOS (nextchr < 0)
+
+#define SET_nextchr \
+ nextchr = ((locinput < PL_regeol) ? UCHARAT(locinput) : NEXTCHR_EOS)
+
+#define SET_locinput(p) \
+ locinput = (p); \
+ SET_nextchr
+
+
/* these are unrolled below in the CCC_TRY_XXX defined */
#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
if (!CAT2(PL_utf8_,class)) { \
* fails, or advance to the next character */
#define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR) \
- if (locinput >= PL_regeol) { \
+ if (NEXTCHR_IS_EOS) { \
sayNO; \
} \
if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \
_CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput), \
CLASS, STR) \
case NAMEA: \
- if (locinput >= PL_regeol || ! FUNCA(nextchr)) { \
+ if (NEXTCHR_IS_EOS || ! FUNCA(nextchr)) { \
sayNO; \
} \
/* Matched a utf8-invariant, so don't have to worry about utf8 */ \
locinput++; \
break; \
case NNAMEA: \
- if (locinput >= PL_regeol || FUNCA(nextchr)) { \
+ if (NEXTCHR_IS_EOS || FUNCA(nextchr)) { \
sayNO; \
} \
goto increment_locinput; \
goto fail;
}
- strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
+ /* XXX we need to pass strbeg as a separate arg: the following is
+ * guesswork and can be wrong... */
+ if (sv && SvPOK(sv)) {
+ char * p = SvPVX(sv);
+ STRLEN cur = SvCUR(sv);
+ if (p <= strpos && strpos < p + cur) {
+ strbeg = p;
+ assert(p <= strend && strend <= p + cur);
+ }
+ else
+ strbeg = strend - cur;
+ }
+ else
+ strbeg = strpos;
+
PL_regeol = strend;
if (utf8_target) {
if (!prog->check_utf8 && prog->check_substr)
#define REXEC_FBC_UTF8_SCAN(CoDe) \
STMT_START { \
- while (s + (uskip = UTF8SKIP(s)) <= strend) { \
+ while (s < strend && s + (uskip = UTF8SKIP(s)) <= strend) { \
CoDe \
s += uskip; \
} \
break;
case LNBREAK:
REXEC_FBC_CSCAN(
- is_LNBREAK_utf8(s),
- is_LNBREAK_latin1(s)
+ is_LNBREAK_utf8_safe(s, strend),
+ is_LNBREAK_latin1_safe(s, strend)
);
break;
case VERTWS:
REXEC_FBC_CSCAN(
- is_VERTWS_utf8(s),
- is_VERTWS_latin1(s)
+ is_VERTWS_utf8_safe(s, strend),
+ is_VERTWS_latin1_safe(s, strend)
);
break;
case NVERTWS:
REXEC_FBC_CSCAN(
- !is_VERTWS_utf8(s),
- !is_VERTWS_latin1(s)
+ !is_VERTWS_utf8_safe(s, strend),
+ !is_VERTWS_latin1_safe(s, strend)
);
break;
case HORIZWS:
REXEC_FBC_CSCAN(
- is_HORIZWS_utf8(s),
- is_HORIZWS_latin1(s)
+ is_HORIZWS_utf8_safe(s, strend),
+ is_HORIZWS_latin1_safe(s, strend)
);
break;
case NHORIZWS:
REXEC_FBC_CSCAN(
- !is_HORIZWS_utf8(s),
- !is_HORIZWS_latin1(s)
+ !is_HORIZWS_utf8_safe(s, strend),
+ !is_HORIZWS_latin1_safe(s, strend)
);
break;
case POSIXA:
}
points[pointpos++ % maxlen]= uc;
- REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
+ if (foldlen || uc < (U8*)strend) {
+ REXEC_TRIE_READ_CHAR(trie_type, trie,
+ widecharmap, uc,
uscan, len, uvc, charid, foldlen,
foldbuf, uniflags);
- DEBUG_TRIE_EXECUTE_r({
- dump_exec_pos( (char *)uc, c, strend, real_start,
- s, utf8_target );
- PerlIO_printf(Perl_debug_log,
- " Charid:%3u CP:%4"UVxf" ",
- charid, uvc);
- });
+ DEBUG_TRIE_EXECUTE_r({
+ dump_exec_pos( (char *)uc, c, strend,
+ real_start, s, utf8_target);
+ PerlIO_printf(Perl_debug_log,
+ " Charid:%3u CP:%4"UVxf" ",
+ charid, uvc);
+ });
+ }
+ else {
+ len = 0;
+ charid = 0;
+ }
+
do {
#ifdef DEBUGGING
while (s <= last1) {
if (regtry(&reginfo, &s))
goto got_it;
- s += UTF8SKIP(s);
+ if (s >= last1) {
+ s++; /* to break out of outer loop */
+ break;
+ }
+ s += UTF8SKIP(s);
}
}
else {
Safefree(prog->offs);
prog->offs = swap;
}
-
return 0;
}
st = PL_regmatch_state = S_push_slab(aTHX);
/* Note that nextchr is a byte even in UTF */
- nextchr = UCHARAT(locinput);
+ SET_nextchr;
scan = prog;
while (scan != NULL) {
reenter_switch:
- nextchr = UCHARAT(locinput);
- assert(nextchr >= 0);
+ SET_nextchr;
switch (state_num) {
case BOL: /* /^../ */
case MBOL: /* /^../m */
if (locinput == PL_bostr ||
- ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
+ (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
{
break;
}
goto seol;
case MEOL: /* /..$/m */
- if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
+ if (!NEXTCHR_IS_EOS && nextchr != '\n')
sayNO;
break;
case SEOL: /* /..$/s */
seol:
- if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
+ if (!NEXTCHR_IS_EOS && nextchr != '\n')
sayNO;
if (PL_regeol - locinput > 1)
sayNO;
break;
case EOS: /* \z */
- if (PL_regeol != locinput)
+ if (!NEXTCHR_IS_EOS)
sayNO;
break;
case SANY: /* /./s */
- if (!nextchr && locinput >= PL_regeol)
+ if (NEXTCHR_IS_EOS)
sayNO;
goto increment_locinput;
case CANY: /* \C */
- if (!nextchr && locinput >= PL_regeol)
+ if (NEXTCHR_IS_EOS)
sayNO;
locinput++;
break;
case REG_ANY: /* /./ */
- if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
+ if ((NEXTCHR_IS_EOS) || nextchr == '\n')
sayNO;
goto increment_locinput;
/* In this case the charclass data is available inline so
we can fail fast without a lot of extra overhead.
*/
- if(!ANYOF_BITMAP_TEST(scan, nextchr)) {
+ if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
DEBUG_EXECUTE_r(
PerlIO_printf(Perl_debug_log,
"%*s %sfailed to match trie start class...%s\n",
HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
U32 state = trie->startstate;
- if (trie->bitmap && !TRIE_BITMAP_TEST(trie, nextchr) ) {
+ if ( trie->bitmap
+ && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
+ {
if (trie->states[ state ].wordnum) {
DEBUG_EXECUTE_r(
PerlIO_printf(Perl_debug_log,
});
/* read a char and goto next state */
- if ( base ) {
+ if ( base && (foldlen || uc < (U8*)PL_regeol)) {
I32 offset;
REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
uscan, len, uvc, charid, foldlen,
locinput += ln;
break;
}
+
case EXACTFL: { /* /abc/il */
re_fold_t folder;
const U8 * fold_array;
}
if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
ln = isALNUM_uni(ln);
- LOAD_UTF8_CHARCLASS_ALNUM();
- n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target);
+ if (NEXTCHR_IS_EOS)
+ n = 0;
+ else {
+ LOAD_UTF8_CHARCLASS_ALNUM();
+ n = swash_fetch(PL_utf8_alnum, (U8*)locinput,
+ utf8_target);
+ }
}
else {
ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
- n = isALNUM_LC_utf8((U8*)locinput);
+ n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC_utf8((U8*)locinput);
}
}
else {
switch (FLAGS(scan)) {
case REGEX_UNICODE_CHARSET:
ln = isWORDCHAR_L1(ln);
- n = isWORDCHAR_L1(nextchr);
+ n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
break;
case REGEX_LOCALE_CHARSET:
ln = isALNUM_LC(ln);
- n = isALNUM_LC(nextchr);
+ n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC(nextchr);
break;
case REGEX_DEPENDS_CHARSET:
ln = isALNUM(ln);
- n = isALNUM(nextchr);
+ n = NEXTCHR_IS_EOS ? 0 : isALNUM(nextchr);
break;
case REGEX_ASCII_RESTRICTED_CHARSET:
case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
ln = isWORDCHAR_A(ln);
- n = isWORDCHAR_A(nextchr);
+ n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
break;
default:
Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
case ANYOFV: /* /[abx{df}]/i */
case ANYOF: /* /[abc]/ */
+ if (NEXTCHR_IS_EOS)
+ sayNO;
if (utf8_target || state_num == ANYOFV) {
STRLEN inclasslen = PL_regeol - locinput;
- if (locinput >= PL_regeol)
- sayNO;
-
if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
sayNO;
locinput += inclasslen;
break;
}
else {
- if (!nextchr && locinput >= PL_regeol)
- sayNO;
if (!REGINCLASS(rex, scan, (U8*)locinput))
sayNO;
locinput++;
digit, "0");
case POSIXA: /* /[[:ascii:]]/ etc */
- if (locinput >= PL_regeol || ! _generic_isCC_A(nextchr, FLAGS(scan))) {
+ if (NEXTCHR_IS_EOS || ! _generic_isCC_A(nextchr, FLAGS(scan))) {
sayNO;
}
/* Matched a utf8-invariant, so don't have to worry about utf8 */
break;
case NPOSIXA: /* /[^[:ascii:]]/ etc */
- if (locinput >= PL_regeol || _generic_isCC_A(nextchr, FLAGS(scan))) {
+ if (NEXTCHR_IS_EOS || _generic_isCC_A(nextchr, FLAGS(scan))) {
sayNO;
}
goto increment_locinput;
Prepend, that one will be a suitable Begin.
*/
- if (locinput >= PL_regeol)
+ if (NEXTCHR_IS_EOS)
sayNO;
if (! utf8_target) {
/* Utf8: See if is ( CR LF ); already know that locinput <
* PL_regeol, so locinput+1 is in bounds */
- if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
+ if ( nextchr == '\r' && locinput+1 < PL_regeol
+ && UCHARAT(locinput + 1) == '\n')
+ {
locinput += 2;
}
else {
}
/* Not utf8: Inline the first character, for speed. */
- if (UCHARAT(s) != nextchr &&
+ if (!NEXTCHR_IS_EOS &&
+ UCHARAT(s) != nextchr &&
(type == REF ||
UCHARAT(s) != fold_array[nextchr]))
sayNO;
(int)(REPORT_CODE_OFF+(depth*2)),
"", (IV)ST.count)
);
- if (ST.c1 != CHRTEST_VOID
+ if ( !NEXTCHR_IS_EOS
+ && ST.c1 != CHRTEST_VOID
&& nextchr != ST.c1
&& nextchr != ST.c2)
{
if (ST.count == ARG1(ST.me) /* min */)
sayNO;
ST.count--;
- locinput = HOPc(locinput, -ST.alen);
- nextchr = UCHARAT(locinput);
+ SET_locinput(HOPc(locinput, -ST.alen));
goto curlym_do_B; /* try to match B */
#undef ST
minmod = 0;
if (ST.min && regrepeat(rex, &li, ST.A, ST.min, depth) < ST.min)
sayNO;
- locinput = li;
- nextchr = UCHARAT(locinput);
+ SET_locinput(li);
ST.count = ST.min;
REGCP_SET(ST.cp);
if (ST.c1 == CHRTEST_VOID)
ST.count = regrepeat(rex, &li, ST.A, ST.max, depth);
if (ST.count < ST.min)
sayNO;
- locinput = li;
- nextchr = UCHARAT(locinput);
+ SET_locinput(li);
if ((ST.count > ST.min)
&& (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
{
}
{
UV c = 0;
- if (ST.c1 != CHRTEST_VOID)
+ if (ST.c1 != CHRTEST_VOID && locinput < PL_regeol)
c = utf8_target ? utf8n_to_uvchr((U8*)locinput,
UTF8_MAXBYTES, 0, uniflags)
: (UV) UCHARAT(locinput);
/* If it could work, try it. */
- if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
+ if (ST.c1 == CHRTEST_VOID
+ || (locinput < PL_regeol &&
+ (c == (UV)ST.c1 || c == (UV)ST.c2)))
+ {
CURLY_SETPAREN(ST.paren, ST.count);
PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
assert(0); /* NOTREACHED */
#undef ST
case LNBREAK: /* \R */
- if ((n=is_LNBREAK(locinput,utf8_target))) {
+ if ((n=is_LNBREAK_safe(locinput, PL_regeol, utf8_target))) {
locinput += n;
} else
sayNO;
#define CASE_CLASS(nAmE) \
case nAmE: \
- if (locinput >= PL_regeol) \
+ if (NEXTCHR_IS_EOS) \
sayNO; \
if ((n=is_##nAmE(locinput,utf8_target))) { \
locinput += n; \
sayNO; \
break; \
case N##nAmE: \
- if (locinput >= PL_regeol) \
+ if (NEXTCHR_IS_EOS) \
sayNO; \
if ((n=is_##nAmE(locinput,utf8_target))) { \
sayNO; \
increment_locinput:
if (utf8_target) {
locinput += PL_utf8skip[nextchr];
+ /* locinput is allowed to go 1 char off the end, but not 2+ */
if (locinput > PL_regeol)
sayNO;
}
case LNBREAK:
if (utf8_target) {
loceol = PL_regeol;
- while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
+ while (hardcount < max && scan < loceol &&
+ (c=is_LNBREAK_utf8_safe(scan, loceol))) {
scan += c;
hardcount++;
}
because we have a null terminated string, but we
have to use hardcount in this situation
*/
- while (scan < loceol && (c=is_LNBREAK_latin1(scan))) {
+ while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
scan+=c;
hardcount++;
}
case HORIZWS:
if (utf8_target) {
loceol = PL_regeol;
- while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
+ while (hardcount < max && scan < loceol &&
+ (c=is_HORIZWS_utf8_safe(scan, loceol)))
+ {
scan += c;
hardcount++;
}
} else {
- while (scan < loceol && is_HORIZWS_latin1(scan))
+ while (scan < loceol && is_HORIZWS_latin1_safe(scan, loceol))
scan++;
}
break;
case NHORIZWS:
if (utf8_target) {
loceol = PL_regeol;
- while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
+ while (hardcount < max && scan < loceol &&
+ !is_HORIZWS_utf8_safe(scan, loceol))
+ {
scan += UTF8SKIP(scan);
hardcount++;
}
} else {
- while (scan < loceol && !is_HORIZWS_latin1(scan))
+ while (scan < loceol && !is_HORIZWS_latin1_safe(scan, loceol))
scan++;
}
case VERTWS:
if (utf8_target) {
loceol = PL_regeol;
- while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
+ while (hardcount < max && scan < loceol &&
+ (c=is_VERTWS_utf8_safe(scan, loceol)))
+ {
scan += c;
hardcount++;
}
} else {
- while (scan < loceol && is_VERTWS_latin1(scan))
+ while (scan < loceol && is_VERTWS_latin1_safe(scan, loceol))
scan++;
}
case NVERTWS:
if (utf8_target) {
loceol = PL_regeol;
- while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
+ while (hardcount < max && scan < loceol &&
+ !is_VERTWS_utf8_safe(scan, loceol))
+ {
scan += UTF8SKIP(scan);
hardcount++;
}
} else {
- while (scan < loceol && !is_VERTWS_latin1(scan))
+ while (scan < loceol && !is_VERTWS_latin1_safe(scan, loceol))
scan++;
}