[perl #119505] Segfault from bad backreference
authorDavid Mitchell <davem@iabyn.com>
Wed, 16 Oct 2013 12:59:12 +0000 (13:59 +0100)
committerDavid Mitchell <davem@iabyn.com>
Thu, 17 Oct 2013 09:57:35 +0000 (10:57 +0100)
The code that parses regex backrefs (or ambiguous backref/octal) such as
\123, did a simple atoi(), which could wrap round to negative values on
long digit strings and cause seg faults.

Include a check on the length of the digit string, and if greater than 9
digits, assume it can never be a valid backref (obviating the need for the
atoi() call).

I've also simplified the code a bit, putting most of the \g handling code
into a single block, rather than doing multiple "if (isg) {...}".

regcomp.c
t/re/re_tests

index 0ec6a76..317094c 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -10594,6 +10594,22 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32
     }
 }
 
+
+/* return atoi(p), unless it's too big to sensibly be a backref,
+ * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
+
+static I32
+S_backref_value(char *p)
+{
+    char *q = p;
+
+    for (;isDIGIT(*q); q++); /* calculate length of num */
+    if (q - p == 0 || q - p > 9)
+        return I32_MAX;
+    return atoi(p);
+}
+
+
 /*
  - regatom - the lowest level
 
@@ -11027,10 +11043,11 @@ tryagain:
        case '5': case '6': case '7': case '8': case '9':
            {
                I32 num;
-               bool isg = *RExC_parse == 'g';
-               bool isrel = 0; 
                bool hasbrace = 0;
-               if (isg) {
+
+               if (*RExC_parse == 'g') {
+                    bool isrel = 0;
+
                    RExC_parse++;
                    if (*RExC_parse == '{') {
                        RExC_parse++;
@@ -11044,25 +11061,36 @@ tryagain:
                        if (isrel) RExC_parse--;
                         RExC_parse -= 2;                           
                        goto parse_named_seq;
-               }   }
-               num = atoi(RExC_parse);
-               if (isg && num == 0) {
-                   if (*RExC_parse == '0') {
+                    }
+
+                    num = S_backref_value(RExC_parse);
+                    if (num == 0)
                         vFAIL("Reference to invalid group 0");
+                    else if (num == I32_MAX) {
+                         if (isDIGIT(*RExC_parse))
+                           vFAIL("Reference to nonexistent group");
+                        else
+                            vFAIL("Unterminated \\g... pattern");
                     }
-                    else {
-                       vFAIL("Unterminated \\g... pattern");
+
+                    if (isrel) {
+                        num = RExC_npar - num;
+                        if (num < 1)
+                            vFAIL("Reference to nonexistent or unclosed group");
                     }
                 }
-                if (isrel) {
-                    num = RExC_npar - num;
-                    if (num < 1)
-                        vFAIL("Reference to nonexistent or unclosed group");
+                else {
+                    num = S_backref_value(RExC_parse);
+                    /* bare \NNN might be backref or octal */
+                    if (num == I32_MAX || (num > 9 && num >= RExC_npar
+                            && *RExC_parse != '8' && *RExC_parse != '9'))
+                        /* Probably a character specified in octal, e.g. \35 */
+                        goto defchar;
                 }
-                if (!isg && num > 9 && num >= RExC_npar && *RExC_parse != '8' && *RExC_parse != '9')
-                    /* Probably a character specified in octal, e.g. \35 */
-                   goto defchar;
-               else {
+
+                /* at this point RExC_parse definitely points to a backref
+                 * number */
+               {
 #ifdef RE_TRACK_PATTERN_OFFSETS
                    char * const parse_start = RExC_parse - 1; /* MJD */
 #endif
@@ -11358,7 +11386,7 @@ tryagain:
                          * 118 OR as "\11" . "8" depending on whether there
                          * were 118 capture buffers defined already in the
                          * pattern.  */
-                        if ( !isDIGIT(p[1]) || atoi(p) <= RExC_npar )
+                        if ( !isDIGIT(p[1]) || S_backref_value(p) <= RExC_npar)
                         {  /* Not to be treated as an octal constant, go
                                    find backref */
                             --p;
index cf3291d..4ea072a 100644 (file)
@@ -1493,6 +1493,47 @@ abc\N{def        -       c       -       \\N{NAME} must be resolved by the lexer
 a\87   a87     c       -       Reference to nonexistent group in regex
 a\97   a97     c       -       Reference to nonexistent group in regex
 
+# avoid problems with 32-bit signed integer overflow
+
+(.)\g2147483648}       x       c       -       Reference to nonexistent group in regex
+(.)\g2147483649}       x       c       -       Reference to nonexistent group in regex
+(.)\g2147483650}       x       c       -       Reference to nonexistent group in regex
+(.)\g4294967296}       x       c       -       Reference to nonexistent group in regex
+(.)\g4294967297}       x       c       -       Reference to nonexistent group in regex
+(.)\g4294967298}       x       c       -       Reference to nonexistent group in regex
+a(.)\g2147483648}      x       c       -       Reference to nonexistent group in regex
+a(.)\g2147483649}      x       c       -       Reference to nonexistent group in regex
+a(.)\g2147483650}      x       c       -       Reference to nonexistent group in regex
+a(.)\g4294967296}      x       c       -       Reference to nonexistent group in regex
+a(.)\g4294967297}      x       c       -       Reference to nonexistent group in regex
+a(.)\g4294967298}      x       c       -       Reference to nonexistent group in regex
+
+(.)\g{2147483648}      x       c       -       Reference to nonexistent group in regex
+(.)\g{2147483649}      x       c       -       Reference to nonexistent group in regex
+(.)\g{2147483650}      x       c       -       Reference to nonexistent group in regex
+(.)\g{4294967296}      x       c       -       Reference to nonexistent group in regex
+(.)\g{4294967297}      x       c       -       Reference to nonexistent group in regex
+(.)\g{4294967298}      x       c       -       Reference to nonexistent group in regex
+a(.)\g{2147483648}     x       c       -       Reference to nonexistent group in regex
+a(.)\g{2147483649}     x       c       -       Reference to nonexistent group in regex
+a(.)\g{2147483650}     x       c       -       Reference to nonexistent group in regex
+a(.)\g{4294967296}     x       c       -       Reference to nonexistent group in regex
+a(.)\g{4294967297}     x       c       -       Reference to nonexistent group in regex
+a(.)\g{4294967298}     x       c       -       Reference to nonexistent group in regex
+
+(.)\2147483648 b\o{214}7483648 y       $1      b
+(.)\2147483649 b\o{214}7483649 y       $1      b
+(.)\2147483650 b\o{214}7483650 y       $1      b
+(.)\4294967296 b\o{42}94967296 y       $1      b
+(.)\4294967297 b\o{42}94967297 y       $1      b
+(.)\4294967298 b\o{42}94967298 y       $1      b
+a(.)\2147483648        ab\o{214}7483648        y       $1      b
+a(.)\2147483649        ab\o{214}7483649        y       $1      b
+a(.)\2147483650        ab\o{214}7483650        y       $1      b
+a(.)\4294967296        ab\o{42}94967296        y       $1      b
+a(.)\4294967297        ab\o{42}94967297        y       $1      b
+a(.)\4294967298        ab\o{42}94967298        y       $1      b
+
 # The below was inserting a NULL into the character class.
 [\8\9] \000    Sn      -       -
 [\8\9] -       sc      $&      Unrecognized escape \\8 in character class