From 0c2990d652e985784f095bba4bc356481a66aa06 Mon Sep 17 00:00:00 2001 From: David Mitchell Date: Wed, 16 Oct 2013 13:59:12 +0100 Subject: [PATCH] [perl #119505] Segfault from bad backreference The code that parses regex backrefs (or ambiguous backref/octal) such as \123, did a simple atoi(), which could wrap round to negative values on long digit strings and cause seg faults. Include a check on the length of the digit string, and if greater than 9 digits, assume it can never be a valid backref (obviating the need for the atoi() call). I've also simplified the code a bit, putting most of the \g handling code into a single block, rather than doing multiple "if (isg) {...}". --- regcomp.c | 64 ++++++++++++++++++++++++++++++++++++++++++----------------- t/re/re_tests | 41 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+), 18 deletions(-) diff --git a/regcomp.c b/regcomp.c index 0ec6a76..317094c 100644 --- a/regcomp.c +++ b/regcomp.c @@ -10594,6 +10594,22 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32 } } + +/* return atoi(p), unless it's too big to sensibly be a backref, + * in which case return I32_MAX (rather than possibly 32-bit wrapping) */ + +static I32 +S_backref_value(char *p) +{ + char *q = p; + + for (;isDIGIT(*q); q++); /* calculate length of num */ + if (q - p == 0 || q - p > 9) + return I32_MAX; + return atoi(p); +} + + /* - regatom - the lowest level @@ -11027,10 +11043,11 @@ tryagain: case '5': case '6': case '7': case '8': case '9': { I32 num; - bool isg = *RExC_parse == 'g'; - bool isrel = 0; bool hasbrace = 0; - if (isg) { + + if (*RExC_parse == 'g') { + bool isrel = 0; + RExC_parse++; if (*RExC_parse == '{') { RExC_parse++; @@ -11044,25 +11061,36 @@ tryagain: if (isrel) RExC_parse--; RExC_parse -= 2; goto parse_named_seq; - } } - num = atoi(RExC_parse); - if (isg && num == 0) { - if (*RExC_parse == '0') { + } + + num = S_backref_value(RExC_parse); + if (num == 0) vFAIL("Reference to invalid group 0"); + else if (num == I32_MAX) { + if (isDIGIT(*RExC_parse)) + vFAIL("Reference to nonexistent group"); + else + vFAIL("Unterminated \\g... pattern"); } - else { - vFAIL("Unterminated \\g... pattern"); + + if (isrel) { + num = RExC_npar - num; + if (num < 1) + vFAIL("Reference to nonexistent or unclosed group"); } } - if (isrel) { - num = RExC_npar - num; - if (num < 1) - vFAIL("Reference to nonexistent or unclosed group"); + else { + num = S_backref_value(RExC_parse); + /* bare \NNN might be backref or octal */ + if (num == I32_MAX || (num > 9 && num >= RExC_npar + && *RExC_parse != '8' && *RExC_parse != '9')) + /* Probably a character specified in octal, e.g. \35 */ + goto defchar; } - if (!isg && num > 9 && num >= RExC_npar && *RExC_parse != '8' && *RExC_parse != '9') - /* Probably a character specified in octal, e.g. \35 */ - goto defchar; - else { + + /* at this point RExC_parse definitely points to a backref + * number */ + { #ifdef RE_TRACK_PATTERN_OFFSETS char * const parse_start = RExC_parse - 1; /* MJD */ #endif @@ -11358,7 +11386,7 @@ tryagain: * 118 OR as "\11" . "8" depending on whether there * were 118 capture buffers defined already in the * pattern. */ - if ( !isDIGIT(p[1]) || atoi(p) <= RExC_npar ) + if ( !isDIGIT(p[1]) || S_backref_value(p) <= RExC_npar) { /* Not to be treated as an octal constant, go find backref */ --p; diff --git a/t/re/re_tests b/t/re/re_tests index cf3291d..4ea072a 100644 --- a/t/re/re_tests +++ b/t/re/re_tests @@ -1493,6 +1493,47 @@ abc\N{def - c - \\N{NAME} must be resolved by the lexer a\87 a87 c - Reference to nonexistent group in regex a\97 a97 c - Reference to nonexistent group in regex +# avoid problems with 32-bit signed integer overflow + +(.)\g2147483648} x c - Reference to nonexistent group in regex +(.)\g2147483649} x c - Reference to nonexistent group in regex +(.)\g2147483650} x c - Reference to nonexistent group in regex +(.)\g4294967296} x c - Reference to nonexistent group in regex +(.)\g4294967297} x c - Reference to nonexistent group in regex +(.)\g4294967298} x c - Reference to nonexistent group in regex +a(.)\g2147483648} x c - Reference to nonexistent group in regex +a(.)\g2147483649} x c - Reference to nonexistent group in regex +a(.)\g2147483650} x c - Reference to nonexistent group in regex +a(.)\g4294967296} x c - Reference to nonexistent group in regex +a(.)\g4294967297} x c - Reference to nonexistent group in regex +a(.)\g4294967298} x c - Reference to nonexistent group in regex + +(.)\g{2147483648} x c - Reference to nonexistent group in regex +(.)\g{2147483649} x c - Reference to nonexistent group in regex +(.)\g{2147483650} x c - Reference to nonexistent group in regex +(.)\g{4294967296} x c - Reference to nonexistent group in regex +(.)\g{4294967297} x c - Reference to nonexistent group in regex +(.)\g{4294967298} x c - Reference to nonexistent group in regex +a(.)\g{2147483648} x c - Reference to nonexistent group in regex +a(.)\g{2147483649} x c - Reference to nonexistent group in regex +a(.)\g{2147483650} x c - Reference to nonexistent group in regex +a(.)\g{4294967296} x c - Reference to nonexistent group in regex +a(.)\g{4294967297} x c - Reference to nonexistent group in regex +a(.)\g{4294967298} x c - Reference to nonexistent group in regex + +(.)\2147483648 b\o{214}7483648 y $1 b +(.)\2147483649 b\o{214}7483649 y $1 b +(.)\2147483650 b\o{214}7483650 y $1 b +(.)\4294967296 b\o{42}94967296 y $1 b +(.)\4294967297 b\o{42}94967297 y $1 b +(.)\4294967298 b\o{42}94967298 y $1 b +a(.)\2147483648 ab\o{214}7483648 y $1 b +a(.)\2147483649 ab\o{214}7483649 y $1 b +a(.)\2147483650 ab\o{214}7483650 y $1 b +a(.)\4294967296 ab\o{42}94967296 y $1 b +a(.)\4294967297 ab\o{42}94967297 y $1 b +a(.)\4294967298 ab\o{42}94967298 y $1 b + # The below was inserting a NULL into the character class. [\8\9] \000 Sn - - [\8\9] - sc $& Unrecognized escape \\8 in character class -- 2.7.4