1 // Copyright 2006 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
10 #include "re2/regexp.h"
16 const char* simplified;
19 static Test tests[] = {
20 // Already-simple constructs
34 // Posix character classes
35 { "[[:alnum:]]", "[0-9A-Za-z]" },
36 { "[[:alpha:]]", "[A-Za-z]" },
37 { "[[:blank:]]", "[\\t ]" },
38 { "[[:cntrl:]]", "[\\x00-\\x1f\\x7f]" },
39 { "[[:digit:]]", "[0-9]" },
40 { "[[:graph:]]", "[!-~]" },
41 { "[[:lower:]]", "[a-z]" },
42 { "[[:print:]]", "[ -~]" },
43 { "[[:punct:]]", "[!-/:-@\\[-`{-~]" },
44 { "[[:space:]]" , "[\\t-\\r ]" },
45 { "[[:upper:]]", "[A-Z]" },
46 { "[[:xdigit:]]", "[0-9A-Fa-f]" },
48 // Perl character classes
50 { "\\s", "[\\t-\\n\\f-\\r ]" },
51 { "\\w", "[0-9A-Z_a-z]" },
53 { "\\S", "[^\\t-\\n\\f-\\r ]" },
54 { "\\W", "[^0-9A-Z_a-z]" },
56 { "[\\s]", "[\\t-\\n\\f-\\r ]" },
57 { "[\\w]", "[0-9A-Z_a-z]" },
58 { "[\\D]", "[^0-9]" },
59 { "[\\S]", "[^\\t-\\n\\f-\\r ]" },
60 { "[\\W]", "[^0-9A-Z_a-z]" },
67 // The next three are illegible because Simplify inserts (?:)
68 // parens instead of () parens to avoid creating extra
69 // captured subexpressions. The comments show a version fewer parens.
70 { "(a){0,2}", "(?:(a)(a)?)?" }, // (aa?)?
71 { "(a){0,4}", "(?:(a)(?:(a)(?:(a)(a)?)?)?)?" }, // (a(a(aa?)?)?)?
72 { "(a){2,6}", "(a)(a)(?:(a)(?:(a)(?:(a)(a)?)?)?)?" }, // aa(a(a(aa?)?)?)?
73 { "a{0,2}", "(?:aa?)?" }, // (aa?)?
74 { "a{0,4}", "(?:a(?:a(?:aa?)?)?)?" }, // (a(a(aa?)?)?)?
75 { "a{2,6}", "aa(?:a(?:a(?:aa?)?)?)?" }, // aa(a(a(aa?)?)?)?
79 { "a{5,}", "aaaaa+" },
81 // Test that operators simplify their arguments.
82 // (Simplify used to not simplify arguments to a {} repeat.)
83 { "(?:a{1,}){1,}", "a+" },
84 { "(a{1,}b{1,})", "(a+b+)" },
85 { "a{1,}|b{1,}", "a+|b+" },
86 { "(?:a{1,})*", "(?:a+)*" },
87 { "(?:a{1,})+", "a+" },
88 { "(?:a{1,})?", "(?:a+)?" },
91 // Character class simplification
93 { "[a-za-za-z]", "[a-z]" },
94 { "[A-Za-zA-Za-z]", "[A-Za-z]" },
95 { "[ABCDEFGH]", "[A-H]" },
96 { "[AB-CD-EF-GH]", "[A-H]" },
97 { "[W-ZP-XE-R]", "[E-Z]" },
98 { "[a-ee-gg-m]", "[a-m]" },
99 { "[a-ea-ha-m]", "[a-m]" },
100 { "[a-ma-ha-e]", "[a-m]" },
101 { "[a-zA-Z0-9 -~]", "[ -~]" },
103 // Empty character classes
104 { "[^[:cntrl:][:^cntrl:]]", "[^\\x00-\\x{10ffff}]" },
106 // Full character classes
107 { "[[:cntrl:][:^cntrl:]]", "." },
109 // Unicode case folding.
112 { "(?i)K", "[Kk\\x{212a}]" },
113 { "(?i)k", "[Kk\\x{212a}]" },
114 { "(?i)\\x{212a}", "[Kk\\x{212a}]" },
115 { "(?i)[a-z]", "[A-Za-z\\x{17f}\\x{212a}]" },
116 { "(?i)[\\x00-\\x{FFFD}]", "[\\x00-\\x{fffd}]" },
117 { "(?i)[\\x00-\\x{10ffff}]", "." },
119 // Empty string as a regular expression.
120 // Empty string must be preserved inside parens in order
121 // to make submatches work right, so these are less
122 // interesting than they used to be. ToString inserts
123 // explicit (?:) in place of non-parenthesized empty strings,
124 // to make them easier to spot for other parsers.
125 { "(a|b|)", "([a-b]|(?:))" },
128 { "(()|())", "(()|())" },
129 { "(a|)", "(a|(?:))" },
130 { "ab()cd()", "ab()cd()" },
138 { "(){0,2}", "(?:()()?)?" },
141 TEST(TestSimplify, SimpleRegexps) {
142 for (int i = 0; i < arraysize(tests); i++) {
144 VLOG(1) << "Testing " << tests[i].regexp;
145 Regexp* re = Regexp::Parse(tests[i].regexp,
146 Regexp::MatchNL | (Regexp::LikePerl &
149 CHECK(re != NULL) << " " << tests[i].regexp << " " << status.Text();
150 Regexp* sre = re->Simplify();
153 // Check that already-simple regexps don't allocate new ones.
154 if (strcmp(tests[i].regexp, tests[i].simplified) == 0) {
155 CHECK(re == sre) << " " << tests[i].regexp
156 << " " << re->ToString() << " " << sre->ToString();
159 EXPECT_EQ(tests[i].simplified, sre->ToString())
160 << " " << tests[i].regexp << " " << sre->Dump();