1 // Copyright 2009 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
6 #include "re2/prefilter.h"
8 #include "re2/unicode_casefold.h"
9 #include "re2/walker-inl.h"
13 static const int Trace = false;
15 typedef set<string>::iterator SSIter;
16 typedef set<string>::const_iterator ConstSSIter;
18 static int alloc_id = 100000; // Used for debugging.
19 // Initializes a Prefilter, allocating subs_ as necessary.
20 Prefilter::Prefilter(Op op) {
23 if (op_ == AND || op_ == OR)
24 subs_ = new vector<Prefilter*>;
26 alloc_id_ = alloc_id++;
27 VLOG(10) << "alloc_id: " << alloc_id_;
30 // Destroys a Prefilter.
31 Prefilter::~Prefilter() {
32 VLOG(10) << "Deleted: " << alloc_id_;
34 for (int i = 0; i < subs_->size(); i++)
41 // Simplify if the node is an empty Or or And.
42 Prefilter* Prefilter::Simplify() {
43 if (op_ != AND && op_ != OR) {
47 // Nothing left in the AND/OR.
48 if (subs_->size() == 0) {
50 op_ = ALL; // AND of nothing is true
52 op_ = NONE; // OR of nothing is false
57 // Just one subnode: throw away wrapper.
58 if (subs_->size() == 1) {
59 Prefilter* a = (*subs_)[0];
68 // Combines two Prefilters together to create an "op" (AND or OR).
69 // The passed Prefilters will be part of the returned Prefilter or deleted.
70 // Does lots of work to avoid creating unnecessarily complicated structures.
71 Prefilter* Prefilter::AndOr(Op op, Prefilter* a, Prefilter* b) {
72 // If a, b can be rewritten as op, do so.
76 // Canonicalize: a->op <= b->op.
77 if (a->op() > b->op()) {
88 // Don't need to look at b, because of canonicalization above.
89 // ALL and NONE are smallest opcodes.
90 if (a->op() == ALL || a->op() == NONE) {
91 if ((a->op() == ALL && op == AND) ||
92 (a->op() == NONE && op == OR)) {
101 // If a and b match op, merge their contents.
102 if (a->op() == op && b->op() == op) {
103 for (int i = 0; i < b->subs()->size(); i++) {
104 Prefilter* bb = (*b->subs())[i];
105 a->subs()->push_back(bb);
112 // If a already has the same op as the op that is under construction
113 // add in b (similarly if b already has the same op, add in a).
120 a->subs()->push_back(b);
124 // Otherwise just return the op.
125 Prefilter* c = new Prefilter(op);
126 c->subs()->push_back(a);
127 c->subs()->push_back(b);
131 Prefilter* Prefilter::And(Prefilter* a, Prefilter* b) {
132 return AndOr(AND, a, b);
135 Prefilter* Prefilter::Or(Prefilter* a, Prefilter* b) {
136 return AndOr(OR, a, b);
139 static void SimplifyStringSet(set<string> *ss) {
140 // Now make sure that the strings aren't redundant. For example, if
141 // we know "ab" is a required string, then it doesn't help at all to
142 // know that "abc" is also a required string, so delete "abc". This
143 // is because, when we are performing a string search to filter
144 // regexps, matching ab will already allow this regexp to be a
145 // candidate for match, so further matching abc is redundant.
147 for (SSIter i = ss->begin(); i != ss->end(); ++i) {
150 while (j != ss->end()) {
151 // Increment j early so that we can erase the element it points to.
154 if (old_j->find(*i) != string::npos)
160 Prefilter* Prefilter::OrStrings(set<string>* ss) {
161 SimplifyStringSet(ss);
162 Prefilter* or_prefilter = NULL;
164 or_prefilter = new Prefilter(NONE);
165 for (SSIter i = ss->begin(); i != ss->end(); ++i)
166 or_prefilter = Or(or_prefilter, FromString(*i));
171 static Rune ToLowerRune(Rune r) {
173 if ('A' <= r && r <= 'Z')
178 CaseFold *f = LookupCaseFold(unicode_tolower, num_unicode_tolower, r);
179 if (f == NULL || r < f->lo)
181 return ApplyFold(f, r);
184 static Rune ToLowerRuneLatin1(Rune r) {
185 if ('A' <= r && r <= 'Z')
190 Prefilter* Prefilter::FromString(const string& str) {
191 Prefilter* m = new Prefilter(Prefilter::ATOM);
196 // Information about a regexp used during computation of Prefilter.
197 // Can be thought of as information about the set of strings matching
198 // the given regular expression.
199 class Prefilter::Info {
204 // More constructors. They delete their Info* arguments.
205 static Info* Alt(Info* a, Info* b);
206 static Info* Concat(Info* a, Info* b);
207 static Info* And(Info* a, Info* b);
208 static Info* Star(Info* a);
209 static Info* Plus(Info* a);
210 static Info* Quest(Info* a);
211 static Info* EmptyString();
212 static Info* NoMatch();
213 static Info* AnyChar();
214 static Info* CClass(CharClass* cc, bool latin1);
215 static Info* Literal(Rune r);
216 static Info* LiteralLatin1(Rune r);
217 static Info* AnyMatch();
219 // Format Info as a string.
222 // Caller takes ownership of the Prefilter.
223 Prefilter* TakeMatch();
225 set<string>& exact() { return exact_; }
227 bool is_exact() const { return is_exact_; }
234 // When is_exact_ is true, the strings that match
235 // are placed in exact_. When it is no longer an exact
236 // set of strings that match this RE, then is_exact_
237 // is false and the match_ contains the required match
241 // Accumulated Prefilter query that any
242 // match for this regexp is guaranteed to match.
247 Prefilter::Info::Info()
252 Prefilter::Info::~Info() {
256 Prefilter* Prefilter::Info::TakeMatch() {
258 match_ = Prefilter::OrStrings(&exact_);
261 Prefilter* m = match_;
266 // Format a Info in string form.
267 string Prefilter::Info::ToString() {
269 // Sometimes when iterating on children of a node,
270 // some children might have NULL Info. Adding
271 // the check here for NULL to take care of cases where
272 // the caller is not checking.
279 for (set<string>::iterator i = exact_.begin(); i != exact_.end(); ++i) {
288 return match_->DebugString();
293 // Add the strings from src to dst.
294 static void CopyIn(const set<string>& src, set<string>* dst) {
295 for (ConstSSIter i = src.begin(); i != src.end(); ++i)
299 // Add the cross-product of a and b to dst.
300 // (For each string i in a and j in b, add i+j.)
301 static void CrossProduct(const set<string>& a,
302 const set<string>& b,
304 for (ConstSSIter i = a.begin(); i != a.end(); ++i)
305 for (ConstSSIter j = b.begin(); j != b.end(); ++j)
306 dst->insert(*i + *j);
309 // Concats a and b. Requires that both are exact sets.
310 // Forms an exact set that is a crossproduct of a and b.
311 Prefilter::Info* Prefilter::Info::Concat(Info* a, Info* b) {
314 DCHECK(a->is_exact_);
315 DCHECK(b && b->is_exact_);
316 Info *ab = new Info();
318 CrossProduct(a->exact_, b->exact_, &ab->exact_);
319 ab->is_exact_ = true;
326 // Constructs an inexact Info for ab given a and b.
327 // Used only when a or b is not exact or when the
328 // exact cross product is likely to be too big.
329 Prefilter::Info* Prefilter::Info::And(Info* a, Info* b) {
335 Info *ab = new Info();
337 ab->match_ = Prefilter::And(a->TakeMatch(), b->TakeMatch());
338 ab->is_exact_ = false;
344 // Constructs Info for a|b given a and b.
345 Prefilter::Info* Prefilter::Info::Alt(Info* a, Info* b) {
346 Info *ab = new Info();
348 if (a->is_exact_ && b->is_exact_) {
349 CopyIn(a->exact_, &ab->exact_);
350 CopyIn(b->exact_, &ab->exact_);
351 ab->is_exact_ = true;
353 // Either a or b has is_exact_ = false. If the other
354 // one has is_exact_ = true, we move it to match_ and
355 // then create a OR of a,b. The resulting Info has
356 // is_exact_ = false.
357 ab->match_ = Prefilter::Or(a->TakeMatch(), b->TakeMatch());
358 ab->is_exact_ = false;
366 // Constructs Info for a? given a.
367 Prefilter::Info* Prefilter::Info::Quest(Info *a) {
368 Info *ab = new Info();
370 ab->is_exact_ = false;
371 ab->match_ = new Prefilter(ALL);
376 // Constructs Info for a* given a.
377 // Same as a? -- not much to do.
378 Prefilter::Info* Prefilter::Info::Star(Info *a) {
382 // Constructs Info for a+ given a. If a was exact set, it isn't
384 Prefilter::Info* Prefilter::Info::Plus(Info *a) {
385 Info *ab = new Info();
387 ab->match_ = a->TakeMatch();
388 ab->is_exact_ = false;
394 static string RuneToString(Rune r) {
396 int n = runetochar(buf, &r);
397 return string(buf, n);
400 static string RuneToStringLatin1(Rune r) {
402 return string(&c, 1);
405 // Constructs Info for literal rune.
406 Prefilter::Info* Prefilter::Info::Literal(Rune r) {
407 Info* info = new Info();
408 info->exact_.insert(RuneToString(ToLowerRune(r)));
409 info->is_exact_ = true;
413 // Constructs Info for literal rune for Latin1 encoded string.
414 Prefilter::Info* Prefilter::Info::LiteralLatin1(Rune r) {
415 Info* info = new Info();
416 info->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r)));
417 info->is_exact_ = true;
421 // Constructs Info for dot (any character).
422 Prefilter::Info* Prefilter::Info::AnyChar() {
423 Prefilter::Info* info = new Prefilter::Info();
424 info->match_ = new Prefilter(ALL);
428 // Constructs Prefilter::Info for no possible match.
429 Prefilter::Info* Prefilter::Info::NoMatch() {
430 Prefilter::Info* info = new Prefilter::Info();
431 info->match_ = new Prefilter(NONE);
435 // Constructs Prefilter::Info for any possible match.
436 // This Prefilter::Info is valid for any regular expression,
437 // since it makes no assertions whatsoever about the
438 // strings being matched.
439 Prefilter::Info* Prefilter::Info::AnyMatch() {
440 Prefilter::Info *info = new Prefilter::Info();
441 info->match_ = new Prefilter(ALL);
445 // Constructs Prefilter::Info for just the empty string.
446 Prefilter::Info* Prefilter::Info::EmptyString() {
447 Prefilter::Info* info = new Prefilter::Info();
448 info->is_exact_ = true;
449 info->exact_.insert("");
453 // Constructs Prefilter::Info for a character class.
454 typedef CharClass::iterator CCIter;
455 Prefilter::Info* Prefilter::Info::CClass(CharClass *cc,
458 VLOG(0) << "CharClassInfo:";
459 for (CCIter i = cc->begin(); i != cc->end(); ++i)
460 VLOG(0) << " " << i->lo << "-" << i->hi;
463 // If the class is too large, it's okay to overestimate.
467 Prefilter::Info *a = new Prefilter::Info();
468 for (CCIter i = cc->begin(); i != cc->end(); ++i)
469 for (Rune r = i->lo; r <= i->hi; r++) {
471 a->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r)));
473 a->exact_.insert(RuneToString(ToLowerRune(r)));
481 VLOG(0) << " = " << a->ToString();
487 class Prefilter::Info::Walker : public Regexp::Walker<Prefilter::Info*> {
489 Walker(bool latin1) : latin1_(latin1) {}
491 virtual Info* PostVisit(
492 Regexp* re, Info* parent_arg,
494 Info** child_args, int nchild_args);
496 virtual Info* ShortVisit(
500 bool latin1() { return latin1_; }
503 DISALLOW_EVIL_CONSTRUCTORS(Walker);
506 Prefilter::Info* Prefilter::BuildInfo(Regexp* re) {
508 LOG(INFO) << "BuildPrefilter::Info: " << re->ToString();
511 bool latin1 = re->parse_flags() & Regexp::Latin1;
512 Prefilter::Info::Walker w(latin1);
513 Prefilter::Info* info = w.WalkExponential(re, NULL, 100000);
515 if (w.stopped_early()) {
523 Prefilter::Info* Prefilter::Info::Walker::ShortVisit(
524 Regexp* re, Prefilter::Info* parent_arg) {
528 // Constructs the Prefilter::Info for the given regular expression.
529 // Assumes re is simplified.
530 Prefilter::Info* Prefilter::Info::Walker::PostVisit(
531 Regexp* re, Prefilter::Info* parent_arg,
532 Prefilter::Info* pre_arg, Prefilter::Info** child_args,
534 Prefilter::Info *info;
538 LOG(DFATAL) << "Bad regexp op " << re->op();
539 info = EmptyString();
546 // These ops match the empty string:
547 case kRegexpEmptyMatch: // anywhere
548 case kRegexpBeginLine: // at beginning of line
549 case kRegexpEndLine: // at end of line
550 case kRegexpBeginText: // at beginning of text
551 case kRegexpEndText: // at end of text
552 case kRegexpWordBoundary: // at word boundary
553 case kRegexpNoWordBoundary: // not at word boundary
554 info = EmptyString();
559 info = LiteralLatin1(re->rune());
562 info = Literal(re->rune());
566 case kRegexpLiteralString:
567 if (re->nrunes() == 0) {
572 info = LiteralLatin1(re->runes()[0]);
573 for (int i = 1; i < re->nrunes(); i++) {
574 info = Concat(info, LiteralLatin1(re->runes()[i]));
577 info = Literal(re->runes()[0]);
578 for (int i = 1; i < re->nrunes(); i++) {
579 info = Concat(info, Literal(re->runes()[i]));
584 case kRegexpConcat: {
585 // Accumulate in info.
586 // Exact is concat of recent contiguous exact nodes.
589 for (int i = 0; i < nchild_args; i++) {
590 Info* ci = child_args[i]; // child info
591 if (!ci->is_exact() ||
592 (exact && ci->exact().size() * exact->exact().size() > 16)) {
593 // Exact run is over.
594 info = And(info, exact);
596 // Add this child's info.
597 info = And(info, ci);
599 // Append to exact run.
600 exact = Concat(exact, ci);
603 info = And(info, exact);
607 case kRegexpAlternate:
608 info = child_args[0];
609 for (int i = 1; i < nchild_args; i++)
610 info = Alt(info, child_args[i]);
611 VLOG(10) << "Alt: " << info->ToString();
615 info = Star(child_args[0]);
619 info = Quest(child_args[0]);
623 info = Plus(child_args[0]);
627 // Claim nothing, except that it's not empty.
631 case kRegexpCharClass:
632 info = CClass(re->cc(), latin1());
636 // These don't affect the set of matching strings.
637 info = child_args[0];
642 VLOG(0) << "BuildInfo " << re->ToString()
643 << ": " << info->ToString();
650 Prefilter* Prefilter::FromRegexp(Regexp* re) {
654 Regexp* simple = re->Simplify();
655 Prefilter::Info *info = BuildInfo(simple);
661 Prefilter* m = info->TakeMatch();
667 string Prefilter::DebugString() const {
673 LOG(DFATAL) << "Bad op in Prefilter::DebugString: " << op_;
674 return StringPrintf("op%d", op_);
676 return "*no-matches*";
683 for (int i = 0; i < subs_->size(); i++) {
686 s += (*subs_)[i]->DebugString();
692 for (int i = 0; i < subs_->size(); i++) {
695 s += (*subs_)[i]->DebugString();
703 Prefilter* Prefilter::FromRE2(const RE2* re2) {
707 Regexp* regexp = re2->Regexp();
711 return FromRegexp(regexp);