1 // Copyright 2008 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 // Regular expression engine tester -- test all the implementations against each other.
8 #include "util/flags.h"
9 #include "re2/testing/tester.h"
12 #include "re2/regexp.h"
14 DEFINE_bool(dump_prog, false, "dump regexp program");
15 DEFINE_bool(log_okay, false, "log successful runs");
16 DEFINE_bool(dump_rprog, false, "dump reversed regexp program");
18 DEFINE_int32(max_regexp_failures, 100,
19 "maximum number of regexp test failures (-1 = unlimited)");
21 DEFINE_string(regexp_engines, "", "pattern to select regexp engines to test");
26 kMaxSubmatch = 1+16, // $0...$16
29 const char* engine_types[kEngineMax] = {
42 // Returns the name string for the type t.
43 static string EngineString(Engine t) {
44 if (t < 0 || t >= arraysize(engine_types) || engine_types[t] == NULL) {
45 return StringPrintf("type%d", static_cast<int>(t));
47 return engine_types[t];
50 // Returns bit mask of engines to use.
51 static uint32 Engines() {
52 static uint32 cached_engines;
53 static bool did_parse;
56 return cached_engines;
58 if (FLAGS_regexp_engines.empty()) {
61 for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++)
62 if (strstr(EngineString(i).c_str(), FLAGS_regexp_engines.c_str()))
63 cached_engines |= 1<<i;
66 if (cached_engines == 0)
67 LOG(INFO) << "Warning: no engines enabled.";
69 cached_engines &= ~(1<<kEnginePCRE);
70 for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++) {
71 if (cached_engines & (1<<i))
72 LOG(INFO) << EngineString(i) << " enabled";
75 return cached_engines;
78 // The result of running a match.
79 struct TestInstance::Result {
80 bool skipped; // test skipped: wasn't applicable
81 bool matched; // found a match
82 bool untrusted; // don't really trust the answer
83 bool have_submatch; // computed all submatch info
84 bool have_submatch0; // computed just submatch[0]
85 StringPiece submatch[kMaxSubmatch];
88 typedef TestInstance::Result Result;
90 // Formats a single capture range s in text in the form (a,b)
91 // where a and b are the starting and ending offsets of s in text.
92 static string FormatCapture(const StringPiece& text, const StringPiece& s) {
93 if (s.begin() == NULL)
95 return StringPrintf("(%d,%d)",
96 static_cast<int>(s.begin() - text.begin()),
97 static_cast<int>(s.end() - text.begin()));
100 // Returns whether text contains non-ASCII (>= 0x80) bytes.
101 static bool NonASCII(const StringPiece& text) {
102 for (int i = 0; i < text.size(); i++)
103 if ((uint8)text[i] >= 0x80)
108 // Returns string representation of match kind.
109 static string FormatKind(Prog::MatchKind kind) {
111 case Prog::kFullMatch:
113 case Prog::kLongestMatch:
114 return "longest match";
115 case Prog::kFirstMatch:
116 return "first match";
117 case Prog::kManyMatch:
123 // Returns string representation of anchor kind.
124 static string FormatAnchor(Prog::Anchor anchor) {
126 case Prog::kAnchored:
128 case Prog::kUnanchored:
135 Regexp::ParseFlags parse_flags;
139 static const Regexp::ParseFlags single_line =
141 static const Regexp::ParseFlags multi_line =
142 static_cast<Regexp::ParseFlags>(Regexp::LikePerl & ~Regexp::OneLine);
144 static ParseMode parse_modes[] = {
145 { single_line, "single-line" },
146 { single_line|Regexp::Latin1, "single-line, latin1" },
147 { multi_line, "multiline" },
148 { multi_line|Regexp::NonGreedy, "multiline, nongreedy" },
149 { multi_line|Regexp::Latin1, "multiline, latin1" },
152 static string FormatMode(Regexp::ParseFlags flags) {
153 for (int i = 0; i < arraysize(parse_modes); i++)
154 if (parse_modes[i].parse_flags == flags)
155 return parse_modes[i].desc;
156 return StringPrintf("%#x", static_cast<uint>(flags));
159 // Constructs and saves all the matching engines that
160 // will be required for the given tests.
161 TestInstance::TestInstance(const StringPiece& regexp_str, Prog::MatchKind kind,
162 Regexp::ParseFlags flags)
163 : regexp_str_(regexp_str),
174 VLOG(1) << CEscape(regexp_str);
176 // Compile regexp to prog.
177 // Always required - needed for backtracking (reference implementation).
179 regexp_ = Regexp::Parse(regexp_str, flags, &status);
180 if (regexp_ == NULL) {
181 LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)
182 << " mode: " << FormatMode(flags);
186 num_captures_ = regexp_->NumCaptures();
187 prog_ = regexp_->CompileToProg(0);
189 LOG(INFO) << "Cannot compile: " << CEscape(regexp_str_);
193 if (FLAGS_dump_prog) {
194 LOG(INFO) << "Prog for "
196 << CEscape(regexp_str_)
197 << " (" << FormatKind(kind_)
198 << ", " << FormatMode(flags_)
203 // Compile regexp to reversed prog. Only needed for DFA engines.
204 if (Engines() & ((1<<kEngineDFA)|(1<<kEngineDFA1))) {
205 rprog_ = regexp_->CompileToReverseProg(0);
206 if (rprog_ == NULL) {
207 LOG(INFO) << "Cannot reverse compile: " << CEscape(regexp_str_);
211 if (FLAGS_dump_rprog)
212 LOG(INFO) << rprog_->Dump();
215 // Create re string that will be used for RE and RE2.
216 string re = regexp_str.as_string();
218 // Regexp::Latin1 will be accomodated below.
219 if (!(flags & Regexp::OneLine))
221 if (flags & Regexp::NonGreedy)
223 if (flags & Regexp::DotNL)
226 // Compile regexp to RE2.
227 if (Engines() & ((1<<kEngineRE2)|(1<<kEngineRE2a)|(1<<kEngineRE2b))) {
228 RE2::Options options;
229 if (flags & Regexp::Latin1)
230 options.set_encoding(RE2::Options::EncodingLatin1);
231 if (kind_ == Prog::kLongestMatch)
232 options.set_longest_match(true);
233 re2_ = new RE2(re, options);
234 if (!re2_->error().empty()) {
235 LOG(INFO) << "Cannot RE2: " << CEscape(re);
241 // Compile regexp to RE.
242 // PCRE as exposed by the RE interface isn't always usable.
243 // 1. It disagrees about handling of empty-string reptitions
244 // like matching (a*)* against "b". PCRE treats the (a*) as
245 // occurring once, while we treat it as occurring not at all.
246 // 2. It treats $ as this weird thing meaning end of string
247 // or before the \n at the end of the string.
248 // 3. It doesn't implement POSIX leftmost-longest matching.
249 // MimicsPCRE() detects 1 and 2.
250 if ((Engines() & (1<<kEnginePCRE)) && regexp_->MimicsPCRE() &&
251 kind_ != Prog::kLongestMatch) {
253 o.set_option(PCRE::UTF8);
254 if (flags & Regexp::Latin1)
255 o.set_option(PCRE::None);
256 // PCRE has interface bug keeping us from finding $0, so
257 // add one more layer of parens.
258 re_ = new PCRE("("+re+")", o);
259 if (!re_->error().empty()) {
260 LOG(INFO) << "Cannot PCRE: " << CEscape(re);
267 TestInstance::~TestInstance() {
276 // Runs a single search using the named engine type.
277 // This interface hides all the irregularities of the various
278 // engine interfaces from the rest of this file.
279 void TestInstance::RunSearch(Engine type,
280 const StringPiece& orig_text,
281 const StringPiece& orig_context,
284 memset(result, 0, sizeof *result);
285 if (regexp_ == NULL) {
286 result->skipped = true;
289 int nsubmatch = 1 + num_captures_; // NumCaptures doesn't count $0
290 if (nsubmatch > kMaxSubmatch)
291 nsubmatch = kMaxSubmatch;
293 StringPiece text = orig_text;
294 StringPiece context = orig_context;
298 LOG(FATAL) << "Bad RunSearch type: " << (int)type;
300 case kEngineBacktrack:
302 result->skipped = true;
306 prog_->UnsafeSearchBacktrack(text, context, anchor, kind_,
307 result->submatch, nsubmatch);
308 result->have_submatch = true;
313 result->skipped = true;
317 prog_->SearchNFA(text, context, anchor, kind_,
318 result->submatch, nsubmatch);
319 result->have_submatch = true;
324 result->skipped = true;
327 result->matched = prog_->SearchDFA(text, context, anchor, kind_, NULL,
328 &result->skipped, NULL);
332 if (prog_ == NULL || rprog_ == NULL) {
333 result->skipped = true;
337 prog_->SearchDFA(text, context, anchor, kind_, result->submatch,
338 &result->skipped, NULL);
339 // If anchored, no need for second run,
340 // but do it anyway to find more bugs.
341 if (result->matched) {
342 if (!rprog_->SearchDFA(result->submatch[0], context,
343 Prog::kAnchored, Prog::kLongestMatch,
345 &result->skipped, NULL)) {
346 LOG(ERROR) << "Reverse DFA inconsistency: " << CEscape(regexp_str_)
347 << " on " << CEscape(text);
348 result->matched = false;
351 result->have_submatch0 = true;
356 anchor == Prog::kUnanchored ||
357 !prog_->IsOnePass() ||
358 nsubmatch > Prog::kMaxOnePassCapture) {
359 result->skipped = true;
362 result->matched = prog_->SearchOnePass(text, context, anchor, kind_,
363 result->submatch, nsubmatch);
364 result->have_submatch = true;
367 case kEngineBitState:
369 result->skipped = true;
372 result->matched = prog_->SearchBitState(text, context, anchor, kind_,
373 result->submatch, nsubmatch);
374 result->have_submatch = true;
380 if (!re2_ || text.end() != context.end()) {
381 result->skipped = true;
385 RE2::Anchor re_anchor;
386 if (anchor == Prog::kAnchored)
387 re_anchor = RE2::ANCHOR_START;
389 re_anchor = RE2::UNANCHORED;
390 if (kind_ == Prog::kFullMatch)
391 re_anchor = RE2::ANCHOR_BOTH;
393 result->matched = re2_->Match(context,
394 text.begin() - context.begin(),
395 text.end() - context.begin(),
396 re_anchor, result->submatch, nsubmatch);
397 result->have_submatch = nsubmatch > 0;
402 if (!re_ || text.begin() != context.begin() ||
403 text.end() != context.end()) {
404 result->skipped = true;
408 const PCRE::Arg **argptr = new const PCRE::Arg*[nsubmatch];
409 PCRE::Arg *a = new PCRE::Arg[nsubmatch];
410 for (int i = 0; i < nsubmatch; i++) {
411 a[i] = PCRE::Arg(&result->submatch[i]);
415 PCRE::Anchor pcre_anchor;
416 if (anchor == Prog::kAnchored)
417 pcre_anchor = PCRE::ANCHOR_START;
419 pcre_anchor = PCRE::UNANCHORED;
420 if (kind_ == Prog::kFullMatch)
421 pcre_anchor = PCRE::ANCHOR_BOTH;
422 re_->ClearHitLimit();
428 if (re_->HitLimit()) {
429 result->untrusted = true;
434 result->have_submatch = true;
436 // Work around RE interface bug: PCRE returns -1 as the
437 // offsets for an unmatched subexpression, and RE should
438 // turn that into StringPiece(NULL) but in fact it uses
439 // StringPiece(text.begin() - 1, 0). Oops.
440 for (int i = 0; i < nsubmatch; i++)
441 if (result->submatch[i].begin() == text.begin() - 1)
442 result->submatch[i] = NULL;
449 if (!result->matched)
450 memset(result->submatch, 0, sizeof result->submatch);
453 // Checks whether r is okay given that correct is the right answer.
454 // Specifically, r's answers have to match (but it doesn't have to
455 // claim to have all the answers).
456 static bool ResultOkay(const Result& r, const Result& correct) {
459 if (r.matched != correct.matched)
461 if (r.have_submatch || r.have_submatch0) {
462 for (int i = 0; i < kMaxSubmatch; i++) {
463 if (correct.submatch[i].begin() != r.submatch[i].begin() ||
464 correct.submatch[i].size() != r.submatch[i].size())
466 if (!r.have_submatch)
473 // Runs a single test.
474 bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context,
475 Prog::Anchor anchor) {
476 // Backtracking is the gold standard.
478 RunSearch(kEngineBacktrack, text, context, anchor, &correct);
479 if (correct.skipped) {
482 LOG(ERROR) << "Skipped backtracking! " << CEscape(regexp_str_)
483 << " " << FormatMode(flags_);
486 VLOG(1) << "Try: regexp " << CEscape(regexp_str_)
487 << " text " << CEscape(text)
488 << " (" << FormatKind(kind_)
489 << ", " << FormatAnchor(anchor)
490 << ", " << FormatMode(flags_)
493 // Compare the others.
494 bool all_okay = true;
495 for (Engine i = kEngineBacktrack+1; i < kEngineMax; i++) {
496 if (!(Engines() & (1<<i)))
500 RunSearch(i, text, context, anchor, &r);
501 if (ResultOkay(r, correct)) {
503 LogMatch(r.skipped ? "Skipped: " : "Okay: ", i, text, context, anchor);
507 // We disagree with PCRE on the meaning of some Unicode matches.
508 // In particular, we treat all non-ASCII UTF-8 as word characters.
509 // We also treat "empty" character sets like [^\w\W] as being
510 // impossible to match, while PCRE apparently excludes some code
511 // points (e.g., 0x0080) from both \w and \W.
512 if (i == kEnginePCRE && NonASCII(text))
518 LogMatch(r.untrusted ? "(Untrusted) Mismatch: " : "Mismatch: ", i, text,
520 if (r.matched != correct.matched) {
522 LOG(INFO) << " Should not match (but does).";
524 LOG(INFO) << " Should match (but does not).";
528 for (int i = 0; i < 1+num_captures_; i++) {
529 if (r.submatch[i].begin() != correct.submatch[i].begin() ||
530 r.submatch[i].end() != correct.submatch[i].end()) {
532 StringPrintf(" $%d: should be %s is %s",
534 FormatCapture(text, correct.submatch[i]).c_str(),
535 FormatCapture(text, r.submatch[i]).c_str());
538 StringPrintf(" $%d: %s ok", i,
539 FormatCapture(text, r.submatch[i]).c_str());
545 if (FLAGS_max_regexp_failures > 0 && --FLAGS_max_regexp_failures == 0)
546 LOG(QFATAL) << "Too many regexp failures.";
552 void TestInstance::LogMatch(const char* prefix, Engine e,
553 const StringPiece& text, const StringPiece& context,
554 Prog::Anchor anchor) {
558 << CEscape(regexp_str_)
560 << CEscape(regexp_->ToString())
564 << text.begin() - context.begin()
566 << text.end() - context.begin()
569 << " (" << FormatKind(kind_)
570 << ", " << FormatAnchor(anchor)
571 << ", " << FormatMode(flags_)
575 static Prog::MatchKind kinds[] = {
581 // Test all possible match kinds and parse modes.
582 Tester::Tester(const StringPiece& regexp) {
584 for (int i = 0; i < arraysize(kinds); i++) {
585 for (int j = 0; j < arraysize(parse_modes); j++) {
586 TestInstance* t = new TestInstance(regexp, kinds[i],
587 parse_modes[j].parse_flags);
588 error_ |= t->error();
595 for (int i = 0; i < v_.size(); i++)
599 bool Tester::TestCase(const StringPiece& text, const StringPiece& context,
600 Prog::Anchor anchor) {
602 for (int i = 0; i < v_.size(); i++)
603 okay &= (!v_[i]->error() && v_[i]->RunCase(text, context, anchor));
607 static Prog::Anchor anchors[] = {
612 bool Tester::TestInput(const StringPiece& text) {
613 bool okay = TestInputInContext(text, text);
614 if (text.size() > 0) {
618 okay &= TestInputInContext(sp, text);
621 okay &= TestInputInContext(sp, text);
626 bool Tester::TestInputInContext(const StringPiece& text,
627 const StringPiece& context) {
629 for (int i = 0; i < arraysize(anchors); i++)
630 okay &= TestCase(text, context, anchors[i]);
634 bool TestRegexpOnText(const StringPiece& regexp,
635 const StringPiece& text) {
637 return t.TestInput(text);