From 7615b8c36db26d7f3731eb4bdac2e85ba4984413 Mon Sep 17 00:00:00 2001 From: DongHun Kwak Date: Wed, 5 Dec 2018 10:36:21 +0900 Subject: [PATCH] Imported Upstream version 20181201 Change-Id: I4e012361ab9520e396e02cc612d950a1698f0be3 Signed-off-by: DongHun Kwak --- re2/fuzzing/re2_fuzzer.cc | 35 +++++++++++++++++++++++-- re2/nfa.cc | 66 ++++++++++++----------------------------------- re2/prefilter.cc | 11 ++++---- re2/prefilter_tree.cc | 1 + re2/re2.cc | 28 +++++++++++++++++--- re2/re2.h | 19 +++++++++++--- re2/testing/re2_test.cc | 21 +++++++++++++++ util/pcre.h | 4 +-- 8 files changed, 120 insertions(+), 65 deletions(-) diff --git a/re2/fuzzing/re2_fuzzer.cc b/re2/fuzzing/re2_fuzzer.cc index 3ce4d1b..28ccaf4 100644 --- a/re2/fuzzing/re2_fuzzer.cc +++ b/re2/fuzzing/re2_fuzzer.cc @@ -5,8 +5,11 @@ #include #include #include +#include +#include #include +#include "re2/prefilter.h" #include "re2/re2.h" using re2::StringPiece; @@ -21,17 +24,45 @@ void Test(StringPiece pattern, const RE2::Options& options, StringPiece text) { return; // Don't waste time fuzzing high-size programs. - // (They can cause bug reports due to fuzzer timeouts.) + // They can cause bug reports due to fuzzer timeouts. int size = re.ProgramSize(); if (size > 9999) return; + int rsize = re.ReverseProgramSize(); + if (rsize > 9999) + return; // Don't waste time fuzzing high-fanout programs. - // (They can also cause bug reports due to fuzzer timeouts.) + // They can cause bug reports due to fuzzer timeouts. std::map histogram; int fanout = re.ProgramFanout(&histogram); if (fanout > 9) return; + int rfanout = re.ReverseProgramFanout(&histogram); + if (rfanout > 9) + return; + + // Don't waste time fuzzing programs with large substrings. + // They can cause bug reports due to fuzzer timeouts when they + // are repetitions (e.g. hundreds of NUL bytes) and matching is + // unanchored. And they aren't interesting for fuzzing purposes. + std::unique_ptr prefilter(re2::Prefilter::FromRE2(&re)); + if (prefilter == nullptr) + return; + std::queue nodes; + nodes.push(prefilter.get()); + while (!nodes.empty()) { + re2::Prefilter* node = nodes.front(); + nodes.pop(); + if (node->op() == re2::Prefilter::ATOM) { + if (node->atom().size() > 9) + return; + } else if (node->op() == re2::Prefilter::AND || + node->op() == re2::Prefilter::OR) { + for (re2::Prefilter* sub : *node->subs()) + nodes.push(sub); + } + } StringPiece sp1, sp2, sp3, sp4; string s1, s2, s3, s4; diff --git a/re2/nfa.cc b/re2/nfa.cc index 1a3e0af..b2d5c0a 100644 --- a/re2/nfa.cc +++ b/re2/nfa.cc @@ -95,20 +95,20 @@ class NFA { // Follows all empty arrows from id0 and enqueues all the states reached. // Enqueues only the ByteRange instructions that match byte c. - // The bits in flag (Bol, Eol, etc.) specify whether ^, $ and \b match. + // context is used (with p) for evaluating empty-width specials. // p is the current input position, and t0 is the current thread. - void AddToThreadq(Threadq* q, int id0, int c, int flag, + void AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context, const char* p, Thread* t0); // Run runq on byte c, appending new states to nextq. // Updates matched_ and match_ as new, better matches are found. + // context is used (with p) for evaluating empty-width specials. // p is the position of byte c in the input string for AddToThreadq; // p-1 will be used when processing Match instructions. - // flag is the bitwise OR of Bol, Eol, etc., specifying whether - // ^, $ and \b match the current input position (after c). // Frees all the threads on runq. // If there is a shortcut to the end, returns that shortcut. - inline int Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p); + int Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context, + const char* p); // Returns text version of capture information, for debugging. string FormatCapture(const char** capture); @@ -204,9 +204,9 @@ void NFA::CopyCapture(const char** dst, const char** src) { // Follows all empty arrows from id0 and enqueues all the states reached. // Enqueues only the ByteRange instructions that match byte c. -// The bits in flag (Bol, Eol, etc.) specify whether ^, $ and \b match. +// context is used (with p) for evaluating empty-width specials. // p is the current input position, and t0 is the current thread. -void NFA::AddToThreadq(Threadq* q, int id0, int c, int flag, +void NFA::AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context, const char* p, Thread* t0) { if (id0 == 0) return; @@ -318,7 +318,7 @@ void NFA::AddToThreadq(Threadq* q, int id0, int c, int flag, stk[nstk++] = AddState(id+1); // Continue on if we have all the right flag bits. - if (ip->empty() & ~flag) + if (ip->empty() & ~Prog::EmptyFlags(context, p)) break; a = AddState(ip->out()); goto Loop; @@ -328,13 +328,13 @@ void NFA::AddToThreadq(Threadq* q, int id0, int c, int flag, // Run runq on byte c, appending new states to nextq. // Updates matched_ and match_ as new, better matches are found. +// context is used (with p) for evaluating empty-width specials. // p is the position of byte c in the input string for AddToThreadq; // p-1 will be used when processing Match instructions. -// flag is the bitwise OR of Bol, Eol, etc., specifying whether -// ^, $ and \b match the current input position (after c). // Frees all the threads on runq. // If there is a shortcut to the end, returns that shortcut. -int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) { +int NFA::Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context, + const char* p) { nextq->clear(); for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) { @@ -360,7 +360,7 @@ int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) { break; case kInstByteRange: - AddToThreadq(nextq, ip->out(), c, flag, p, t); + AddToThreadq(nextq, ip->out(), c, context, p, t); break; case kInstAltMatch: @@ -500,38 +500,9 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, runq->clear(); nextq->clear(); memset(&match_[0], 0, ncapture_*sizeof match_[0]); - int wasword = 0; - - if (text.begin() > context.begin()) - wasword = Prog::IsWordChar(text.begin()[-1] & 0xFF); // Loop over the text, stepping the machine. for (const char* p = text.begin();; p++) { - // Check for empty-width specials. - int flag = 0; - - // ^ and \A - if (p == context.begin()) - flag |= kEmptyBeginText | kEmptyBeginLine; - else if (p <= context.end() && p[-1] == '\n') - flag |= kEmptyBeginLine; - - // $ and \z - if (p == context.end()) - flag |= kEmptyEndText | kEmptyEndLine; - else if (p < context.end() && p[0] == '\n') - flag |= kEmptyEndLine; - - // \b and \B - int isword = 0; - if (p < context.end()) - isword = Prog::IsWordChar(p[0] & 0xFF); - - if (isword != wasword) - flag |= kEmptyWordBoundary; - else - flag |= kEmptyNonWordBoundary; - if (ExtraDebug) { int c = 0; if (p == context.begin()) @@ -541,7 +512,7 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, else if (p < text.end()) c = p[0] & 0xFF; - fprintf(stderr, "%c[%#x/%d/%d]:", c, flag, isword, wasword); + fprintf(stderr, "%c:", c); for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) { Thread* t = i->second; if (t == NULL) @@ -552,7 +523,7 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, } // This is a no-op the first time around the loop because runq is empty. - int id = Step(runq, nextq, p < text.end() ? p[0] & 0xFF : -1, flag, p); + int id = Step(runq, nextq, p < text.end() ? p[0] & 0xFF : -1, context, p); DCHECK_EQ(runq->size(), 0); using std::swap; swap(nextq, runq); @@ -604,17 +575,14 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, p = reinterpret_cast(memchr(p, fb, text.end() - p)); if (p == NULL) { p = text.end(); - isword = 0; - } else { - isword = Prog::IsWordChar(p[0] & 0xFF); } - flag = Prog::EmptyFlags(context, p); } Thread* t = AllocThread(); CopyCapture(t->capture, match_); t->capture[0] = p; - AddToThreadq(runq, start_, p < text.end() ? p[0] & 0xFF : -1, flag, p, t); + AddToThreadq(runq, start_, p < text.end() ? p[0] & 0xFF : -1, context, p, + t); Decref(t); } @@ -624,8 +592,6 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, fprintf(stderr, "dead\n"); break; } - - wasword = isword; } for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) diff --git a/re2/prefilter.cc b/re2/prefilter.cc index e34aaf0..b657357 100644 --- a/re2/prefilter.cc +++ b/re2/prefilter.cc @@ -214,7 +214,7 @@ class Prefilter::Info { static Info* Quest(Info* a); static Info* EmptyString(); static Info* NoMatch(); - static Info* AnyChar(); + static Info* AnyCharOrAnyByte(); static Info* CClass(CharClass* cc, bool latin1); static Info* Literal(Rune r); static Info* LiteralLatin1(Rune r); @@ -417,8 +417,8 @@ Prefilter::Info* Prefilter::Info::LiteralLatin1(Rune r) { return info; } -// Constructs Info for dot (any character). -Prefilter::Info* Prefilter::Info::AnyChar() { +// Constructs Info for dot (any character) or \C (any byte). +Prefilter::Info* Prefilter::Info::AnyCharOrAnyByte() { Prefilter::Info* info = new Prefilter::Info(); info->match_ = new Prefilter(ALL); return info; @@ -461,7 +461,7 @@ Prefilter::Info* Prefilter::Info::CClass(CharClass *cc, // If the class is too large, it's okay to overestimate. if (cc->size() > 10) - return AnyChar(); + return AnyCharOrAnyByte(); Prefilter::Info *a = new Prefilter::Info(); for (CCIter i = cc->begin(); i != cc->end(); ++i) @@ -622,8 +622,9 @@ Prefilter::Info* Prefilter::Info::Walker::PostVisit( break; case kRegexpAnyChar: + case kRegexpAnyByte: // Claim nothing, except that it's not empty. - info = AnyChar(); + info = AnyCharOrAnyByte(); break; case kRegexpCharClass: diff --git a/re2/prefilter_tree.cc b/re2/prefilter_tree.cc index 376c6cd..a07de40 100644 --- a/re2/prefilter_tree.cc +++ b/re2/prefilter_tree.cc @@ -138,6 +138,7 @@ bool PrefilterTree::KeepNode(Prefilter* node) const { return false; case Prefilter::ALL: + case Prefilter::NONE: return false; case Prefilter::ATOM: diff --git a/re2/re2.cc b/re2/re2.cc index 89c67ec..f3f96ae 100644 --- a/re2/re2.cc +++ b/re2/re2.cc @@ -262,11 +262,18 @@ int RE2::ProgramSize() const { return prog_->size(); } -int RE2::ProgramFanout(std::map* histogram) const { +int RE2::ReverseProgramSize() const { if (prog_ == NULL) return -1; - SparseArray fanout(prog_->size()); - prog_->Fanout(&fanout); + Prog* prog = ReverseProg(); + if (prog == NULL) + return -1; + return prog->size(); +} + +static int Fanout(Prog* prog, std::map* histogram) { + SparseArray fanout(prog->size()); + prog->Fanout(&fanout); histogram->clear(); for (SparseArray::iterator i = fanout.begin(); i != fanout.end(); ++i) { // TODO(junyer): Optimise this? @@ -279,6 +286,21 @@ int RE2::ProgramFanout(std::map* histogram) const { return histogram->rbegin()->first; } +int RE2::ProgramFanout(std::map* histogram) const { + if (prog_ == NULL) + return -1; + return Fanout(prog_, histogram); +} + +int RE2::ReverseProgramFanout(std::map* histogram) const { + if (prog_ == NULL) + return -1; + Prog* prog = ReverseProg(); + if (prog == NULL) + return -1; + return Fanout(prog, histogram); +} + // Returns num_captures_, computing it if needed, or -1 if the // regexp wasn't valid on construction. int RE2::NumberOfCapturingGroups() const { diff --git a/re2/re2.h b/re2/re2.h index 2a012e1..c93de56 100644 --- a/re2/re2.h +++ b/re2/re2.h @@ -53,9 +53,19 @@ // CHECK(RE2::FullMatch(latin1_string, RE2(latin1_pattern, RE2::Latin1))); // // ----------------------------------------------------------------------- -// MATCHING WITH SUB-STRING EXTRACTION: -// -// You can supply extra pointer arguments to extract matched subpieces. +// MATCHING WITH SUBSTRING EXTRACTION: +// +// You can supply extra pointer arguments to extract matched substrings. +// On match failure, none of the pointees will have been modified. +// On match success, the substrings will be converted (as necessary) and +// their values will be assigned to their pointees until all conversions +// have succeeded or one conversion has failed. +// On conversion failure, the pointees will be in an indeterminate state +// because the caller has no way of knowing which conversion failed. +// However, conversion cannot fail for types like string and StringPiece +// that do not inspect the substring contents. Hence, in the common case +// where all of the pointees are of such types, failure is always due to +// match failure and thus none of the pointees will have been modified. // // Example: extracts "ruby" into "s" and 1234 into "i" // int i; @@ -278,11 +288,13 @@ class RE2 { // Returns the program size, a very approximate measure of a regexp's "cost". // Larger numbers are more expensive than smaller numbers. int ProgramSize() const; + int ReverseProgramSize() const; // EXPERIMENTAL! SUBJECT TO CHANGE! // Outputs the program fanout as a histogram bucketed by powers of 2. // Returns the number of the largest non-empty bucket. int ProgramFanout(std::map* histogram) const; + int ReverseProgramFanout(std::map* histogram) const; // Returns the underlying Regexp; not for general use. // Returns entire_regexp_ so that callers don't need @@ -489,6 +501,7 @@ class RE2 { // I.e. matching RE2("(foo)|(bar)baz") on "barbazbla" will return true, with // submatch[0] = "barbaz", submatch[1].data() = NULL, submatch[2] = "bar", // submatch[3].data() = NULL, ..., up to submatch[nsubmatch-1].data() = NULL. + // Caveat: submatch[] may be clobbered even on match failure. // // Don't ask for more match information than you will use: // runs much faster with nsubmatch == 1 than nsubmatch > 1, and diff --git a/re2/testing/re2_test.cc b/re2/testing/re2_test.cc index b847325..cae956c 100644 --- a/re2/testing/re2_test.cc +++ b/re2/testing/re2_test.cc @@ -461,6 +461,10 @@ TEST(ProgramSize, BigProgram) { ASSERT_GT(re_simple.ProgramSize(), 0); ASSERT_GT(re_medium.ProgramSize(), re_simple.ProgramSize()); ASSERT_GT(re_complex.ProgramSize(), re_medium.ProgramSize()); + + ASSERT_GT(re_simple.ReverseProgramSize(), 0); + ASSERT_GT(re_medium.ReverseProgramSize(), re_simple.ReverseProgramSize()); + ASSERT_GT(re_complex.ReverseProgramSize(), re_medium.ReverseProgramSize()); } TEST(ProgramFanout, BigProgram) { @@ -486,6 +490,23 @@ TEST(ProgramFanout, BigProgram) { // 13 is the largest non-empty bucket and has 1000 elements. ASSERT_EQ(13, re1000.ProgramFanout(&histogram)); ASSERT_EQ(1000, histogram[13]); + + // 2 is the largest non-empty bucket and has 3 elements. + // This differs from the others due to how reverse `.' works. + ASSERT_EQ(2, re1.ReverseProgramFanout(&histogram)); + ASSERT_EQ(3, histogram[2]); + + // 5 is the largest non-empty bucket and has 10 elements. + ASSERT_EQ(5, re10.ReverseProgramFanout(&histogram)); + ASSERT_EQ(10, histogram[5]); + + // 9 is the largest non-empty bucket and has 100 elements. + ASSERT_EQ(9, re100.ReverseProgramFanout(&histogram)); + ASSERT_EQ(100, histogram[9]); + + // 12 is the largest non-empty bucket and has 1000 elements. + ASSERT_EQ(12, re1000.ReverseProgramFanout(&histogram)); + ASSERT_EQ(1000, histogram[12]); } // Issue 956519: handling empty character sets was diff --git a/util/pcre.h b/util/pcre.h index 7c6403d..10ec4f2 100644 --- a/util/pcre.h +++ b/util/pcre.h @@ -61,9 +61,9 @@ // CHECK(PCRE::FullMatch(utf8_string, re)); // // ----------------------------------------------------------------------- -// MATCHING WITH SUB-STRING EXTRACTION: +// MATCHING WITH SUBSTRING EXTRACTION: // -// You can supply extra pointer arguments to extract matched subpieces. +// You can supply extra pointer arguments to extract matched substrings. // // Example: extracts "ruby" into "s" and 1234 into "i" // int i; -- 2.7.4