stop regex engine reading beyond end of string

author David Mitchell <davem@iabyn.com>

Fri, 21 Sep 2012 09:29:04 +0000 (10:29 +0100)

committer David Mitchell <davem@iabyn.com>

Wed, 26 Sep 2012 08:41:10 +0000 (09:41 +0100)
author David Mitchell <davem@iabyn.com>
Fri, 21 Sep 2012 09:29:04 +0000 (10:29 +0100)
committer David Mitchell <davem@iabyn.com>
Wed, 26 Sep 2012 08:41:10 +0000 (09:41 +0100)
diff --git a/MANIFEST b/MANIFEST

index a3e752b..a6884d0 100644 (file)
--- a/MANIFEST
+++ b/MANIFEST
@@ -3989,6 +3989,7 @@ ext/XS-APItest/t/blockhooks.t     XS::APItest: tests for PL_blockhooks
  ext/XS-APItest/t/Block.pm      Helper for ./blockhooks.t
  ext/XS-APItest/t/call_checker.t        test call checker plugin API
  ext/XS-APItest/t/caller.t      XS::APItest: tests for caller_cx
+ext/XS-APItest/t/callregexec.t XS::APItest: tests for CALLREGEXEC()
  ext/XS-APItest/t/call.t                XS::APItest extension
  ext/XS-APItest/t/check_warnings.t      test scope of "Too late for CHECK"
  ext/XS-APItest/t/cleanup.t     test stack behaviour on unwinding
diff --git a/ext/XS-APItest/APItest.pm b/ext/XS-APItest/APItest.pm

index 749af95..f33b80b 100644 (file)
--- a/ext/XS-APItest/APItest.pm
+++ b/ext/XS-APItest/APItest.pm
@@ -5,7 +5,7 @@ use strict;
  use warnings;
  use Carp;
  
-our $VERSION = '0.43';
+our $VERSION = '0.44';
  
  require XSLoader;
  
diff --git a/ext/XS-APItest/APItest.xs b/ext/XS-APItest/APItest.xs

index 08694e6..357b033 100644 (file)
--- a/ext/XS-APItest/APItest.xs
+++ b/ext/XS-APItest/APItest.xs
@@ -3407,6 +3407,30 @@ CODE:
  OUTPUT:
      RETVAL
  
+ # provide access to CALLREGEXEC, except replace pointers within the
+ # string with offsets from the start of the string
+
+I32
+callregexec(SV *prog, STRLEN stringarg, STRLEN strend, I32 minend, SV *sv, U32 nosave)
+CODE:
+    {
+       STRLEN len;
+       char *strbeg;
+       if (SvROK(prog))
+           prog = SvRV(prog);
+       strbeg = SvPV_force(sv, len);
+       RETVAL = CALLREGEXEC((REGEXP *)prog,
+                           strbeg + stringarg,
+                           strbeg + strend,
+                           strbeg,
+                           minend,
+                           sv,
+                           NULL, /* data */
+                           nosave);
+    }
+OUTPUT:
+    RETVAL
+
  
  MODULE = XS::APItest PACKAGE = XS::APItest::AUTOLOADtest
  
diff --git a/ext/XS-APItest/t/callregexec.t b/ext/XS-APItest/t/callregexec.t

new file mode 100644 (file)

index 0000000..3111390
--- /dev/null
+++ b/ext/XS-APItest/t/callregexec.t
@@ -0,0 +1,66 @@
+#!perl
+
+# test CALLREGEXEC()
+# (currently it just checks that it handles non-\0 terminated strings;
+# full tests haven't been added yet)
+
+use warnings;
+use strict;
+
+use XS::APItest;
+*callregexec = *XS::APItest::callregexec;
+
+use Test::More tests => 50;
+
+# Test that the regex engine can handle strings without terminating \0
+# XXX This is by no means comprehensive; it doesn't test all ops, nor all
+# code paths within those ops (especially not utf8).
+
+
+# this sub takes a string that has an extraneous char at the end.
+# First see if the string (less the last char) matches the regex;
+# then see if that string (including the last char) matches when
+# calling callregexec(), but with the length arg set to 1 char less than
+# the length of the string.
+# In theory the result should be the same for both matches, since
+# they should both not 'see' the final char.
+
+sub try {
+    my ($str, $re, $exp, $desc) = @_;
+
+    my $str1 = substr($str, 0, -1);
+    ok !!$exp == !!($str1 =~ $re), "$desc str =~ qr";
+
+    my $bytes = do { use bytes; length $str1 };
+    ok  !!$exp == !!callregexec($re, 0, $bytes, 0, $str, 0),
+           "$desc callregexec";
+}
+
+
+{
+    try "\nx",         qr/\n^/m,          0, 'MBOL';
+    try "ax",          qr/a$/m,           1, 'MEOL';
+    try "ax",          qr/a$/s,           1, 'SEOL';
+    try "abx",         qr/^(ab|X)./s,     0, 'SANY';
+    try "abx",         qr/^(ab|X)\C/,     0, 'CANY';
+    try "abx",         qr/^(ab|X)./,      0, 'REG_ANY';
+    try "abx",         qr/^ab(c|d|e|x)/,  0, 'TRIE/TRIEC';
+    try "abx",         qr/^abx/,          0, 'EXACT';
+    try "abx",         qr/^ABX/i,         0, 'EXACTF';
+    try "abx",         qr/^ab\b/,         1, 'BOUND';
+    try "ab-",         qr/^ab\B/,         0, 'NBOUND';
+    try "aas",         qr/a[st]/,         0, 'ANYOF';
+    try "aas",         qr/a[s\xDF]/i,     0, 'ANYOFV';
+    try "ab1",         qr/ab\d/,          0, 'DIGIT';
+    try "ab\n",        qr/ab[[:ascii:]]/, 0, 'POSIX';
+    try "aP\x{307}",   qr/^a\X/,          1, 'CLUMP 1';
+    try "aP\x{307}x",  qr/^a\X/,          1, 'CLUMP 2';
+    try "\x{100}\r\n", qr/^\x{100}\X/,    1, 'CLUMP 3';
+    try "abb",         qr/^a(b)\1/,       0, 'REF';
+    try "ab\n",        qr/^.+\R/,         0, 'LNBREAK';
+    try "ab\n",        qr/^.+\v/,         0, 'VERTWS';
+    try "abx",         qr/^.+\V/,         1, 'NVERTWS';
+    try "ab\t",        qr/^.+\h/,         0, 'HORIZWS';
+    try "abx",         qr/^.+\H/,         1, 'NHORIZWS';
+    try "abx",         qr/a.*x/,          0, 'CURLY';
+}
diff --git a/regexec.c b/regexec.c

index 6ace8b6..989affa 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -121,6 +121,18 @@
  #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
  #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
  
+
+#define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
+#define NEXTCHR_IS_EOS (nextchr < 0)
+
+#define SET_nextchr \
+    nextchr = ((locinput < PL_regeol) ? UCHARAT(locinput) : NEXTCHR_EOS)
+
+#define SET_locinput(p) \
+    locinput = (p);  \
+    SET_nextchr
+
+
  /* these are unrolled below in the CCC_TRY_XXX defined */
  #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
      if (!CAT2(PL_utf8_,class)) { \
@@ -165,7 +177,7 @@
   * fails, or advance to the next character */
  
  #define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR)                \
-    if (locinput >= PL_regeol) {                                              \
+    if (NEXTCHR_IS_EOS) {                                              \
         sayNO;                                                                \
      }                                                                         \
      if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                          \
@@ -219,14 +231,14 @@
         _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput),       \
                        CLASS, STR)                                            \
      case NAMEA:                                                               \
-       if (locinput >= PL_regeol || ! FUNCA(nextchr)) {                      \
+       if (NEXTCHR_IS_EOS || ! FUNCA(nextchr)) {                      \
             sayNO;                                                            \
         }                                                                     \
         /* Matched a utf8-invariant, so don't have to worry about utf8 */     \
         locinput++;                                        \
         break;                                                                \
      case NNAMEA:                                                              \
-       if (locinput >= PL_regeol || FUNCA(nextchr)) {                        \
+       if (NEXTCHR_IS_EOS || FUNCA(nextchr)) {                        \
             sayNO;                                                            \
         }                                                                     \
          goto increment_locinput;                                              \
@@ -597,7 +609,21 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
         goto fail;
      }
                  
-    strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
+    /* XXX we need to pass strbeg as a separate arg: the following is
+     * guesswork and can be wrong... */
+    if (sv && SvPOK(sv)) {
+        char * p   = SvPVX(sv);
+        STRLEN cur = SvCUR(sv); 
+        if (p <= strpos && strpos < p + cur) {
+            strbeg = p;
+            assert(p <= strend && strend <= p + cur);
+        }
+        else
+            strbeg = strend - cur;
+    }
+    else 
+        strbeg = strpos;
+
      PL_regeol = strend;
      if (utf8_target) {
         if (!prog->check_utf8 && prog->check_substr)
@@ -1238,7 +1264,7 @@ STMT_START {                                              \
  
  #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
  STMT_START {                                          \
-    while (s + (uskip = UTF8SKIP(s)) <= strend) {     \
+    while (s < strend && s + (uskip = UTF8SKIP(s)) <= strend) {     \
         CoDe                                          \
         s += uskip;                                   \
      }                                                 \
@@ -1763,32 +1789,32 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
             break;
         case LNBREAK:
             REXEC_FBC_CSCAN(
-               is_LNBREAK_utf8(s),
-               is_LNBREAK_latin1(s)
+               is_LNBREAK_utf8_safe(s, strend),
+               is_LNBREAK_latin1_safe(s, strend)
             );
             break;
         case VERTWS:
             REXEC_FBC_CSCAN(
-               is_VERTWS_utf8(s),
-               is_VERTWS_latin1(s)
+               is_VERTWS_utf8_safe(s, strend),
+               is_VERTWS_latin1_safe(s, strend)
             );
             break;
         case NVERTWS:
             REXEC_FBC_CSCAN(
-               !is_VERTWS_utf8(s),
-               !is_VERTWS_latin1(s)
+               !is_VERTWS_utf8_safe(s, strend),
+               !is_VERTWS_latin1_safe(s, strend)
             );
             break;
         case HORIZWS:
             REXEC_FBC_CSCAN(
-               is_HORIZWS_utf8(s),
-               is_HORIZWS_latin1(s)
+               is_HORIZWS_utf8_safe(s, strend),
+               is_HORIZWS_latin1_safe(s, strend)
             );
             break;
         case NHORIZWS:
             REXEC_FBC_CSCAN(
-               !is_HORIZWS_utf8(s),
-               !is_HORIZWS_latin1(s)
+               !is_HORIZWS_utf8_safe(s, strend),
+               !is_HORIZWS_latin1_safe(s, strend)
             );      
             break;
         case POSIXA:
@@ -1923,16 +1949,24 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                              
                          }
                          points[pointpos++ % maxlen]= uc;
-                       REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
+                        if (foldlen || uc < (U8*)strend) {
+                            REXEC_TRIE_READ_CHAR(trie_type, trie,
+                                             widecharmap, uc,
                                              uscan, len, uvc, charid, foldlen,
                                              foldbuf, uniflags);
-                        DEBUG_TRIE_EXECUTE_r({
-                            dump_exec_pos( (char *)uc, c, strend, real_start, 
-                                s,   utf8_target );
-                            PerlIO_printf(Perl_debug_log,
-                                " Charid:%3u CP:%4"UVxf" ",
-                                 charid, uvc);
-                        });
+                            DEBUG_TRIE_EXECUTE_r({
+                                dump_exec_pos( (char *)uc, c, strend,
+                                            real_start, s, utf8_target);
+                                PerlIO_printf(Perl_debug_log,
+                                    " Charid:%3u CP:%4"UVxf" ",
+                                     charid, uvc);
+                            });
+                        }
+                        else {
+                            len = 0;
+                            charid = 0;
+                        }
+
  
                          do {
  #ifdef DEBUGGING
@@ -2380,7 +2414,11 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre
                 while (s <= last1) {
                     if (regtry(&reginfo, &s))
                         goto got_it;
-                   s += UTF8SKIP(s);
+                    if (s >= last1) {
+                        s++; /* to break out of outer loop */
+                        break;
+                    }
+                    s += UTF8SKIP(s);
                 }
             }
             else {
@@ -2682,7 +2720,6 @@ phooey:
          Safefree(prog->offs);
          prog->offs = swap;
      }
-
      return 0;
  }
  
@@ -3288,7 +3325,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
         st = PL_regmatch_state = S_push_slab(aTHX);
  
      /* Note that nextchr is a byte even in UTF */
-    nextchr = UCHARAT(locinput);
+    SET_nextchr;
      scan = prog;
      while (scan != NULL) {
  
@@ -3313,8 +3350,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
  
        reenter_switch:
  
-        nextchr = UCHARAT(locinput);
-        assert(nextchr >= 0);
+        SET_nextchr;
  
         switch (state_num) {
         case BOL: /*  /^../  */
@@ -3327,7 +3363,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
  
         case MBOL: /*  /^../m  */
             if (locinput == PL_bostr ||
-               ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
+               (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
             {
                 break;
             }
@@ -3359,36 +3395,36 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                 goto seol;
  
         case MEOL: /* /..$/m  */
-           if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
+           if (!NEXTCHR_IS_EOS && nextchr != '\n')
                 sayNO;
             break;
  
         case SEOL: /* /..$/s  */
           seol:
-           if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
+           if (!NEXTCHR_IS_EOS && nextchr != '\n')
                 sayNO;
             if (PL_regeol - locinput > 1)
                 sayNO;
             break;
  
         case EOS: /*  \z  */
-           if (PL_regeol != locinput)
+           if (!NEXTCHR_IS_EOS)
                 sayNO;
             break;
  
         case SANY: /*  /./s  */
-           if (!nextchr && locinput >= PL_regeol)
+           if (NEXTCHR_IS_EOS)
                 sayNO;
              goto increment_locinput;
  
         case CANY: /*  \C  */
-           if (!nextchr && locinput >= PL_regeol)
+           if (NEXTCHR_IS_EOS)
                 sayNO;
             locinput++;
             break;
  
         case REG_ANY: /*  /./  */
-           if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
+           if ((NEXTCHR_IS_EOS) || nextchr == '\n')
                 sayNO;
              goto increment_locinput;
  
@@ -3399,7 +3435,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
              /* In this case the charclass data is available inline so
                 we can fail fast without a lot of extra overhead. 
               */
-            if(!ANYOF_BITMAP_TEST(scan, nextchr)) {
+            if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
                  DEBUG_EXECUTE_r(
                      PerlIO_printf(Perl_debug_log,
                                "%*s  %sfailed to match trie start class...%s\n",
@@ -3464,7 +3500,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
                  U32 state = trie->startstate;
  
-                if (trie->bitmap && !TRIE_BITMAP_TEST(trie, nextchr) ) {
+                if (   trie->bitmap
+                    && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
+                {
                     if (trie->states[ state ].wordnum) {
                          DEBUG_EXECUTE_r(
                              PerlIO_printf(Perl_debug_log,
@@ -3537,7 +3575,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                     });
  
                     /* read a char and goto next state */
-                   if ( base ) {
+                   if ( base && (foldlen || uc < (U8*)PL_regeol)) {
                         I32 offset;
                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
                                              uscan, len, uvc, charid, foldlen,
@@ -3808,6 +3846,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
             locinput += ln;
             break;
             }
+
         case EXACTFL: {          /*  /abc/il      */
             re_fold_t folder;
             const U8 * fold_array;
@@ -3899,12 +3938,17 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                 }
                 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
                     ln = isALNUM_uni(ln);
-                   LOAD_UTF8_CHARCLASS_ALNUM();
-                   n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target);
+                    if (NEXTCHR_IS_EOS)
+                        n = 0;
+                    else {
+                        LOAD_UTF8_CHARCLASS_ALNUM();
+                        n = swash_fetch(PL_utf8_alnum, (U8*)locinput,
+                                                                utf8_target);
+                    }
                 }
                 else {
                     ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
-                   n = isALNUM_LC_utf8((U8*)locinput);
+                   n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC_utf8((U8*)locinput);
                 }
             }
             else {
@@ -3925,20 +3969,20 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                 switch (FLAGS(scan)) {
                     case REGEX_UNICODE_CHARSET:
                         ln = isWORDCHAR_L1(ln);
-                       n = isWORDCHAR_L1(nextchr);
+                       n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
                         break;
                     case REGEX_LOCALE_CHARSET:
                         ln = isALNUM_LC(ln);
-                       n = isALNUM_LC(nextchr);
+                       n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC(nextchr);
                         break;
                     case REGEX_DEPENDS_CHARSET:
                         ln = isALNUM(ln);
-                       n = isALNUM(nextchr);
+                       n = NEXTCHR_IS_EOS ? 0 : isALNUM(nextchr);
                         break;
                     case REGEX_ASCII_RESTRICTED_CHARSET:
                     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
                         ln = isWORDCHAR_A(ln);
-                       n = isWORDCHAR_A(nextchr);
+                       n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
                         break;
                     default:
                         Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
@@ -3953,19 +3997,16 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
  
         case ANYOFV: /*  /[abx{df}]/i  */
         case ANYOF:  /*  /[abc]/       */
+            if (NEXTCHR_IS_EOS)
+                sayNO;
             if (utf8_target || state_num == ANYOFV) {
                 STRLEN inclasslen = PL_regeol - locinput;
-               if (locinput >= PL_regeol)
-                   sayNO;
-
                 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
                     sayNO;
                 locinput += inclasslen;
                 break;
             }
             else {
-               if (!nextchr && locinput >= PL_regeol)
-                   sayNO;
                 if (!REGINCLASS(rex, scan, (U8*)locinput))
                     sayNO;
                 locinput++;
@@ -3993,7 +4034,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                 digit, "0");
  
          case POSIXA: /* /[[:ascii:]]/ etc */
-            if (locinput >= PL_regeol || ! _generic_isCC_A(nextchr, FLAGS(scan))) {
+            if (NEXTCHR_IS_EOS || ! _generic_isCC_A(nextchr, FLAGS(scan))) {
                  sayNO;
              }
              /* Matched a utf8-invariant, so don't have to worry about utf8 */
@@ -4001,7 +4042,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
              break;
  
          case NPOSIXA: /*  /[^[:ascii:]]/  etc */
-            if (locinput >= PL_regeol || _generic_isCC_A(nextchr, FLAGS(scan))) {
+            if (NEXTCHR_IS_EOS || _generic_isCC_A(nextchr, FLAGS(scan))) {
                  sayNO;
              }
              goto increment_locinput;
@@ -4040,7 +4081,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                Prepend, that one will be a suitable Begin.
             */
  
-           if (locinput >= PL_regeol)
+           if (NEXTCHR_IS_EOS)
                 sayNO;
             if  (! utf8_target) {
  
@@ -4056,7 +4097,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
  
                 /* Utf8: See if is ( CR LF ); already know that locinput <
                  * PL_regeol, so locinput+1 is in bounds */
-               if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
+               if ( nextchr == '\r' && locinput+1 < PL_regeol
+                        && UCHARAT(locinput + 1) == '\n')
+                {
                     locinput += 2;
                 }
                 else {
@@ -4311,7 +4354,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
             }
  
             /* Not utf8:  Inline the first character, for speed. */
-           if (UCHARAT(s) != nextchr &&
+           if (!NEXTCHR_IS_EOS &&
+                UCHARAT(s) != nextchr &&
                 (type == REF ||
                  UCHARAT(s) != fold_array[nextchr]))
                 sayNO;
@@ -5325,7 +5369,8 @@ NULL
                     (int)(REPORT_CODE_OFF+(depth*2)),
                     "", (IV)ST.count)
                 );
-           if (ST.c1 != CHRTEST_VOID
+           if (       !NEXTCHR_IS_EOS
+                    && ST.c1 != CHRTEST_VOID
                     && nextchr != ST.c1
                     && nextchr != ST.c2)
             {
@@ -5379,8 +5424,7 @@ NULL
             if (ST.count == ARG1(ST.me) /* min */)
                 sayNO;
             ST.count--;
-           locinput = HOPc(locinput, -ST.alen);
-            nextchr = UCHARAT(locinput);
+           SET_locinput(HOPc(locinput, -ST.alen));
             goto curlym_do_B; /* try to match B */
  
  #undef ST
@@ -5511,8 +5555,7 @@ NULL
                 minmod = 0;
                 if (ST.min && regrepeat(rex, &li, ST.A, ST.min, depth) < ST.min)
                     sayNO;
-                locinput = li;
-                nextchr = UCHARAT(locinput);
+                SET_locinput(li);
                 ST.count = ST.min;
                 REGCP_SET(ST.cp);
                 if (ST.c1 == CHRTEST_VOID)
@@ -5549,8 +5592,7 @@ NULL
                 ST.count = regrepeat(rex, &li, ST.A, ST.max, depth);
                 if (ST.count < ST.min)
                     sayNO;
-                locinput = li;
-                nextchr = UCHARAT(locinput);
+                SET_locinput(li);
                 if ((ST.count > ST.min)
                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
                 {
@@ -5690,12 +5732,15 @@ NULL
              }
             {
                 UV c = 0;
-               if (ST.c1 != CHRTEST_VOID)
+               if (ST.c1 != CHRTEST_VOID && locinput < PL_regeol)
                     c = utf8_target ? utf8n_to_uvchr((U8*)locinput,
                                            UTF8_MAXBYTES, 0, uniflags)
                                 : (UV) UCHARAT(locinput);
                 /* If it could work, try it. */
-               if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
+               if (ST.c1 == CHRTEST_VOID
+                    || (locinput < PL_regeol &&
+                        (c == (UV)ST.c1 || c == (UV)ST.c2)))
+                {
                     CURLY_SETPAREN(ST.paren, ST.count);
                     PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
                     assert(0); /* NOTREACHED */
@@ -5949,7 +5994,7 @@ NULL
  #undef ST
  
          case LNBREAK: /* \R */
-            if ((n=is_LNBREAK(locinput,utf8_target))) {
+            if ((n=is_LNBREAK_safe(locinput, PL_regeol, utf8_target))) {
                  locinput += n;
              } else
                  sayNO;
@@ -5957,7 +6002,7 @@ NULL
  
  #define CASE_CLASS(nAmE)                              \
          case nAmE:                                    \
-           if (locinput >= PL_regeol)                \
+           if (NEXTCHR_IS_EOS)                       \
                 sayNO;                                \
              if ((n=is_##nAmE(locinput,utf8_target))) {    \
                  locinput += n;                        \
@@ -5965,7 +6010,7 @@ NULL
                  sayNO;                                \
              break;                                    \
          case N##nAmE:                                 \
-           if (locinput >= PL_regeol)                \
+           if (NEXTCHR_IS_EOS)                       \
                 sayNO;                                \
              if ((n=is_##nAmE(locinput,utf8_target))) {    \
                  sayNO;                                \
@@ -5988,6 +6033,7 @@ NULL
          increment_locinput:
              if (utf8_target) {
                  locinput += PL_utf8skip[nextchr];
+                /* locinput is allowed to go 1 char off the end, but not 2+ */
                  if (locinput > PL_regeol)
                      sayNO;
              }
@@ -6654,7 +6700,8 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
      case LNBREAK:
          if (utf8_target) {
             loceol = PL_regeol;
-           while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
+           while (hardcount < max && scan < loceol &&
+                    (c=is_LNBREAK_utf8_safe(scan, loceol))) {
                 scan += c;
                 hardcount++;
             }
@@ -6664,7 +6711,7 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
               because we have a null terminated string, but we
               have to use hardcount in this situation
             */
-           while (scan < loceol && (c=is_LNBREAK_latin1(scan)))  {
+           while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
                 scan+=c;
                 hardcount++;
             }
@@ -6673,24 +6720,28 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
      case HORIZWS:
          if (utf8_target) {
             loceol = PL_regeol;
-           while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
+           while (hardcount < max && scan < loceol &&
+                    (c=is_HORIZWS_utf8_safe(scan, loceol)))
+            {
                 scan += c;
                 hardcount++;
             }
         } else {
-           while (scan < loceol && is_HORIZWS_latin1(scan)) 
+           while (scan < loceol && is_HORIZWS_latin1_safe(scan, loceol)) 
                 scan++;         
         }       
         break;
      case NHORIZWS:
          if (utf8_target) {
             loceol = PL_regeol;
-           while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
+           while (hardcount < max && scan < loceol &&
+                        !is_HORIZWS_utf8_safe(scan, loceol))
+            {
                 scan += UTF8SKIP(scan);
                 hardcount++;
             }
         } else {
-           while (scan < loceol && !is_HORIZWS_latin1(scan))
+           while (scan < loceol && !is_HORIZWS_latin1_safe(scan, loceol))
                 scan++;
  
         }       
@@ -6698,12 +6749,14 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
      case VERTWS:
          if (utf8_target) {
             loceol = PL_regeol;
-           while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
+           while (hardcount < max && scan < loceol &&
+                            (c=is_VERTWS_utf8_safe(scan, loceol)))
+            {
                 scan += c;
                 hardcount++;
             }
         } else {
-           while (scan < loceol && is_VERTWS_latin1(scan)) 
+           while (scan < loceol && is_VERTWS_latin1_safe(scan, loceol)) 
                 scan++;
  
         }       
@@ -6711,12 +6764,14 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
      case NVERTWS:
          if (utf8_target) {
             loceol = PL_regeol;
-           while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
+           while (hardcount < max && scan < loceol &&
+                                !is_VERTWS_utf8_safe(scan, loceol))
+            {
                 scan += UTF8SKIP(scan);
                 hardcount++;
             }
         } else {
-           while (scan < loceol && !is_VERTWS_latin1(scan)) 
+           while (scan < loceol && !is_VERTWS_latin1_safe(scan, loceol)) 
                 scan++;
            
         }
author	David Mitchell <davem@iabyn.com>
	Fri, 21 Sep 2012 09:29:04 +0000 (10:29 +0100)
committer	David Mitchell <davem@iabyn.com>
	Wed, 26 Sep 2012 08:41:10 +0000 (09:41 +0100)
MANIFEST		patch \| blob \| history
ext/XS-APItest/APItest.pm		patch \| blob \| history
ext/XS-APItest/APItest.xs		patch \| blob \| history
ext/XS-APItest/t/callregexec.t	[new file with mode: 0644]	patch \| blob
regexec.c		patch \| blob \| history