1 // Copyright 2011 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
14 // An Error describes a failure to parse a regular expression
15 // and gives the offending expression.
21 func (e *Error) Error() string {
22 return "error parsing regexp: " + e.Code.String() + ": `" + e.Expr + "`"
25 // An ErrorCode describes a failure to parse a regular expression.
30 ErrInternalError ErrorCode = "regexp/syntax: internal error"
33 ErrInvalidCharClass ErrorCode = "invalid character class"
34 ErrInvalidCharRange ErrorCode = "invalid character class range"
35 ErrInvalidEscape ErrorCode = "invalid escape sequence"
36 ErrInvalidNamedCapture ErrorCode = "invalid named capture"
37 ErrInvalidPerlOp ErrorCode = "invalid or unsupported Perl syntax"
38 ErrInvalidRepeatOp ErrorCode = "invalid nested repetition operator"
39 ErrInvalidRepeatSize ErrorCode = "invalid repeat count"
40 ErrInvalidUTF8 ErrorCode = "invalid UTF-8"
41 ErrMissingBracket ErrorCode = "missing closing ]"
42 ErrMissingParen ErrorCode = "missing closing )"
43 ErrMissingRepeatArgument ErrorCode = "missing argument to repetition operator"
44 ErrTrailingBackslash ErrorCode = "trailing backslash at end of expression"
47 // TODO: Export for Go 1.1.
48 const errUnexpectedParen ErrorCode = "unexpected )"
50 func (e ErrorCode) String() string {
54 // Flags control the behavior of the parser and record information about regexp context.
58 FoldCase Flags = 1 << iota // case-insensitive match
59 Literal // treat pattern as literal string
60 ClassNL // allow character classes like [^a-z] and [[:space:]] to match newline
61 DotNL // allow . to match newline
62 OneLine // treat ^ and $ as only matching at beginning and end of text
63 NonGreedy // make repetition operators default to non-greedy
64 PerlX // allow Perl extensions
65 UnicodeGroups // allow \p{Han}, \P{Han} for Unicode group and negation
66 WasDollar // regexp OpEndText was $, not \z
67 Simple // regexp contains no counted repetition
69 MatchNL = ClassNL | DotNL
71 Perl = ClassNL | OneLine | PerlX | UnicodeGroups // as close to Perl as possible
72 POSIX Flags = 0 // POSIX syntax
75 // Pseudo-ops for parsing stack.
77 opLeftParen = opPseudo + iota
82 flags Flags // parse mode flags
83 stack []*Regexp // stack of parsed expressions
85 numCap int // number of capturing groups seen
87 tmpClass []rune // temporary char class work space
90 func (p *parser) newRegexp(op Op) *Regexp {
102 func (p *parser) reuse(re *Regexp) {
107 // Parse stack manipulation.
109 // push pushes the regexp re onto the parse stack and returns the regexp.
110 func (p *parser) push(re *Regexp) *Regexp {
111 if re.Op == OpCharClass && len(re.Rune) == 2 && re.Rune[0] == re.Rune[1] {
113 if p.maybeConcat(re.Rune[0], p.flags&^FoldCase) {
117 re.Rune = re.Rune[:1]
118 re.Flags = p.flags &^ FoldCase
119 } else if re.Op == OpCharClass && len(re.Rune) == 4 &&
120 re.Rune[0] == re.Rune[1] && re.Rune[2] == re.Rune[3] &&
121 unicode.SimpleFold(re.Rune[0]) == re.Rune[2] &&
122 unicode.SimpleFold(re.Rune[2]) == re.Rune[0] ||
123 re.Op == OpCharClass && len(re.Rune) == 2 &&
124 re.Rune[0]+1 == re.Rune[1] &&
125 unicode.SimpleFold(re.Rune[0]) == re.Rune[1] &&
126 unicode.SimpleFold(re.Rune[1]) == re.Rune[0] {
127 // Case-insensitive rune like [Aa] or [Δδ].
128 if p.maybeConcat(re.Rune[0], p.flags|FoldCase) {
132 // Rewrite as (case-insensitive) literal.
134 re.Rune = re.Rune[:1]
135 re.Flags = p.flags | FoldCase
137 // Incremental concatenation.
141 p.stack = append(p.stack, re)
145 // maybeConcat implements incremental concatenation
146 // of literal runes into string nodes. The parser calls this
147 // before each push, so only the top fragment of the stack
148 // might need processing. Since this is called before a push,
149 // the topmost literal is no longer subject to operators like *
150 // (Otherwise ab* would turn into (ab)*.)
151 // If r >= 0 and there's a node left over, maybeConcat uses it
152 // to push r with the given flags.
153 // maybeConcat reports whether r was pushed.
154 func (p *parser) maybeConcat(r rune, flags Flags) bool {
162 if re1.Op != OpLiteral || re2.Op != OpLiteral || re1.Flags&FoldCase != re2.Flags&FoldCase {
166 // Push re1 into re2.
167 re2.Rune = append(re2.Rune, re1.Rune...)
169 // Reuse re1 if possible.
171 re1.Rune = re1.Rune0[:1]
177 p.stack = p.stack[:n-1]
179 return false // did not push r
182 // newLiteral returns a new OpLiteral Regexp with the given flags
183 func (p *parser) newLiteral(r rune, flags Flags) *Regexp {
184 re := p.newRegexp(OpLiteral)
186 if flags&FoldCase != 0 {
190 re.Rune = re.Rune0[:1]
194 // minFoldRune returns the minimum rune fold-equivalent to r.
195 func minFoldRune(r rune) rune {
196 if r < MinFold || r > MaxFold {
201 for r = unicode.SimpleFold(r); r != r0; r = unicode.SimpleFold(r) {
209 // literal pushes a literal regexp for the rune r on the stack
210 // and returns that regexp.
211 func (p *parser) literal(r rune) {
212 p.push(p.newLiteral(r, p.flags))
215 // op pushes a regexp with the given op onto the stack
216 // and returns that regexp.
217 func (p *parser) op(op Op) *Regexp {
218 re := p.newRegexp(op)
223 // repeat replaces the top stack element with itself repeated according to op, min, max.
224 // before is the regexp suffix starting at the repetition operator.
225 // after is the regexp suffix following after the repetition operator.
226 // repeat returns an updated 'after' and an error, if any.
227 func (p *parser) repeat(op Op, min, max int, before, after, lastRepeat string) (string, error) {
229 if p.flags&PerlX != 0 {
230 if len(after) > 0 && after[0] == '?' {
234 if lastRepeat != "" {
235 // In Perl it is not allowed to stack repetition operators:
236 // a** is a syntax error, not a doubled star, and a++ means
237 // something else entirely, which we don't support!
238 return "", &Error{ErrInvalidRepeatOp, lastRepeat[:len(lastRepeat)-len(after)]}
243 return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]}
246 if sub.Op >= opPseudo {
247 return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]}
249 re := p.newRegexp(op)
259 // concat replaces the top of the stack (above the topmost '|' or '(') with its concatenation.
260 func (p *parser) concat() *Regexp {
263 // Scan down to find pseudo-operator | or (.
265 for i > 0 && p.stack[i-1].Op < opPseudo {
269 p.stack = p.stack[:i]
271 // Empty concatenation is special case.
273 return p.push(p.newRegexp(OpEmptyMatch))
276 return p.push(p.collapse(subs, OpConcat))
279 // alternate replaces the top of the stack (above the topmost '(') with its alternation.
280 func (p *parser) alternate() *Regexp {
281 // Scan down to find pseudo-operator (.
282 // There are no | above (.
284 for i > 0 && p.stack[i-1].Op < opPseudo {
288 p.stack = p.stack[:i]
290 // Make sure top class is clean.
291 // All the others already are (see swapVerticalBar).
293 cleanAlt(subs[len(subs)-1])
296 // Empty alternate is special case
297 // (shouldn't happen but easy to handle).
299 return p.push(p.newRegexp(OpNoMatch))
302 return p.push(p.collapse(subs, OpAlternate))
305 // cleanAlt cleans re for eventual inclusion in an alternation.
306 func cleanAlt(re *Regexp) {
309 re.Rune = cleanClass(&re.Rune)
310 if len(re.Rune) == 2 && re.Rune[0] == 0 && re.Rune[1] == unicode.MaxRune {
315 if len(re.Rune) == 4 && re.Rune[0] == 0 && re.Rune[1] == '\n'-1 && re.Rune[2] == '\n'+1 && re.Rune[3] == unicode.MaxRune {
317 re.Op = OpAnyCharNotNL
320 if cap(re.Rune)-len(re.Rune) > 100 {
321 // re.Rune will not grow any more.
322 // Make a copy or inline to reclaim storage.
323 re.Rune = append(re.Rune0[:0], re.Rune...)
328 // collapse returns the result of applying op to sub.
329 // If sub contains op nodes, they all get hoisted up
330 // so that there is never a concat of a concat or an
331 // alternate of an alternate.
332 func (p *parser) collapse(subs []*Regexp, op Op) *Regexp {
336 re := p.newRegexp(op)
338 for _, sub := range subs {
340 re.Sub = append(re.Sub, sub.Sub...)
343 re.Sub = append(re.Sub, sub)
346 if op == OpAlternate {
347 re.Sub = p.factor(re.Sub, re.Flags)
348 if len(re.Sub) == 1 {
357 // factor factors common prefixes from the alternation list sub.
358 // It returns a replacement list that reuses the same storage and
359 // frees (passes to p.reuse) any removed *Regexps.
362 // ABC|ABD|AEF|BCX|BCY
363 // simplifies by literal prefix extraction to
364 // A(B(C|D)|EF)|BC(X|Y)
365 // which simplifies by character class introduction to
366 // A(B[CD]|EF)|BC[XY]
368 func (p *parser) factor(sub []*Regexp, flags Flags) []*Regexp {
373 // Round 1: Factor out common literal prefixes.
378 for i := 0; i <= len(sub); i++ {
379 // Invariant: the Regexps that were in sub[0:start] have been
380 // used or marked for reuse, and the slice space has been reused
381 // for out (len(out) <= start).
383 // Invariant: sub[start:i] consists of regexps that all begin
384 // with str as modified by strflags.
388 istr, iflags = p.leadingString(sub[i])
389 if iflags == strflags {
391 for same < len(str) && same < len(istr) && str[same] == istr[same] {
395 // Matches at least one rune in current range.
396 // Keep going around.
403 // Found end of a run with common leading literal string:
404 // sub[start:i] all begin with str[0:len(str)], but sub[i]
405 // does not even begin with str[0].
407 // Factor out common string and append factored expression to out.
409 // Nothing to do - run of length 0.
410 } else if i == start+1 {
411 // Just one: don't bother factoring.
412 out = append(out, sub[start])
414 // Construct factored form: prefix(suffix1|suffix2|...)
415 prefix := p.newRegexp(OpLiteral)
416 prefix.Flags = strflags
417 prefix.Rune = append(prefix.Rune[:0], str...)
419 for j := start; j < i; j++ {
420 sub[j] = p.removeLeadingString(sub[j], len(str))
422 suffix := p.collapse(sub[start:i], OpAlternate) // recurse
424 re := p.newRegexp(OpConcat)
425 re.Sub = append(re.Sub[:0], prefix, suffix)
426 out = append(out, re)
429 // Prepare for next iteration.
436 // Round 2: Factor out common complex prefixes,
437 // just the first piece of each concatenation,
438 // whatever it is. This is good enough a lot of the time.
442 for i := 0; i <= len(sub); i++ {
443 // Invariant: the Regexps that were in sub[0:start] have been
444 // used or marked for reuse, and the slice space has been reused
445 // for out (len(out) <= start).
447 // Invariant: sub[start:i] consists of regexps that all begin with ifirst.
450 ifirst = p.leadingRegexp(sub[i])
451 if first != nil && first.Equal(ifirst) {
456 // Found end of a run with common leading regexp:
457 // sub[start:i] all begin with first but sub[i] does not.
459 // Factor out common regexp and append factored expression to out.
461 // Nothing to do - run of length 0.
462 } else if i == start+1 {
463 // Just one: don't bother factoring.
464 out = append(out, sub[start])
466 // Construct factored form: prefix(suffix1|suffix2|...)
468 for j := start; j < i; j++ {
469 reuse := j != start // prefix came from sub[start]
470 sub[j] = p.removeLeadingRegexp(sub[j], reuse)
472 suffix := p.collapse(sub[start:i], OpAlternate) // recurse
474 re := p.newRegexp(OpConcat)
475 re.Sub = append(re.Sub[:0], prefix, suffix)
476 out = append(out, re)
479 // Prepare for next iteration.
485 // Round 3: Collapse runs of single literals into character classes.
488 for i := 0; i <= len(sub); i++ {
489 // Invariant: the Regexps that were in sub[0:start] have been
490 // used or marked for reuse, and the slice space has been reused
491 // for out (len(out) <= start).
493 // Invariant: sub[start:i] consists of regexps that are either
494 // literal runes or character classes.
495 if i < len(sub) && isCharClass(sub[i]) {
499 // sub[i] is not a char or char class;
500 // emit char class for sub[start:i]...
502 // Nothing to do - run of length 0.
503 } else if i == start+1 {
504 out = append(out, sub[start])
506 // Make new char class.
507 // Start with most complex regexp in sub[start].
509 for j := start + 1; j < i; j++ {
510 if sub[max].Op < sub[j].Op || sub[max].Op == sub[j].Op && len(sub[max].Rune) < len(sub[j].Rune) {
514 sub[start], sub[max] = sub[max], sub[start]
516 for j := start + 1; j < i; j++ {
517 mergeCharClass(sub[start], sub[j])
521 out = append(out, sub[start])
524 // ... and then emit sub[i].
526 out = append(out, sub[i])
532 // Round 4: Collapse runs of empty matches into a single empty match.
536 if i+1 < len(sub) && sub[i].Op == OpEmptyMatch && sub[i+1].Op == OpEmptyMatch {
539 out = append(out, sub[i])
546 // leadingString returns the leading literal string that re begins with.
547 // The string refers to storage in re or its children.
548 func (p *parser) leadingString(re *Regexp) ([]rune, Flags) {
549 if re.Op == OpConcat && len(re.Sub) > 0 {
552 if re.Op != OpLiteral {
555 return re.Rune, re.Flags & FoldCase
558 // removeLeadingString removes the first n leading runes
559 // from the beginning of re. It returns the replacement for re.
560 func (p *parser) removeLeadingString(re *Regexp, n int) *Regexp {
561 if re.Op == OpConcat && len(re.Sub) > 0 {
562 // Removing a leading string in a concatenation
563 // might simplify the concatenation.
565 sub = p.removeLeadingString(sub, n)
567 if sub.Op == OpEmptyMatch {
571 // Impossible but handle.
579 copy(re.Sub, re.Sub[1:])
580 re.Sub = re.Sub[:len(re.Sub)-1]
586 if re.Op == OpLiteral {
587 re.Rune = re.Rune[:copy(re.Rune, re.Rune[n:])]
588 if len(re.Rune) == 0 {
595 // leadingRegexp returns the leading regexp that re begins with.
596 // The regexp refers to storage in re or its children.
597 func (p *parser) leadingRegexp(re *Regexp) *Regexp {
598 if re.Op == OpEmptyMatch {
601 if re.Op == OpConcat && len(re.Sub) > 0 {
603 if sub.Op == OpEmptyMatch {
611 // removeLeadingRegexp removes the leading regexp in re.
612 // It returns the replacement for re.
613 // If reuse is true, it passes the removed regexp (if no longer needed) to p.reuse.
614 func (p *parser) removeLeadingRegexp(re *Regexp, reuse bool) *Regexp {
615 if re.Op == OpConcat && len(re.Sub) > 0 {
619 re.Sub = re.Sub[:copy(re.Sub, re.Sub[1:])]
634 return p.newRegexp(OpEmptyMatch)
637 func literalRegexp(s string, flags Flags) *Regexp {
638 re := &Regexp{Op: OpLiteral}
640 re.Rune = re.Rune0[:0] // use local storage for small strings
641 for _, c := range s {
642 if len(re.Rune) >= cap(re.Rune) {
643 // string is too long to fit in Rune0. let Go handle it
647 re.Rune = append(re.Rune, c)
654 // Parse parses a regular expression string s, controlled by the specified
655 // Flags, and returns a regular expression parse tree. The syntax is
656 // described in the top-level comment for package regexp.
657 func Parse(s string, flags Flags) (*Regexp, error) {
658 if flags&Literal != 0 {
659 // Trivial parser for literal string.
660 if err := checkUTF8(s); err != nil {
663 return literalRegexp(s, flags), nil
666 // Otherwise, must do real work.
683 if c, t, err = nextRune(t); err != nil {
689 if p.flags&PerlX != 0 && len(t) >= 2 && t[1] == '?' {
690 // Flag changes and non-capturing groups.
691 if t, err = p.parsePerlFlags(t); err != nil {
697 p.op(opLeftParen).Cap = p.numCap
700 if err = p.parseVerticalBar(); err != nil {
705 if err = p.parseRightParen(); err != nil {
710 if p.flags&OneLine != 0 {
717 if p.flags&OneLine != 0 {
718 p.op(OpEndText).Flags |= WasDollar
724 if p.flags&DotNL != 0 {
731 if t, err = p.parseClass(t); err != nil {
745 if after, err = p.repeat(op, min, max, before, after, lastRepeat); err != nil {
753 min, max, after, ok := p.parseRepeat(t)
755 // If the repeat cannot be parsed, { is a literal.
760 if min < 0 || min > 1000 || max > 1000 || max >= 0 && min > max {
761 // Numbers were too big, or max is present and min > max.
762 return nil, &Error{ErrInvalidRepeatSize, before[:len(before)-len(after)]}
764 if after, err = p.repeat(op, min, max, before, after, lastRepeat); err != nil {
770 if p.flags&PerlX != 0 && len(t) >= 2 {
781 p.op(OpNoWordBoundary)
785 // any byte; not supported
786 return nil, &Error{ErrInvalidEscape, t[:2]}
788 // \Q ... \E: the ... is always literals
790 if i := strings.Index(t, `\E`); i < 0 {
797 p.push(literalRegexp(lit, p.flags))
806 re := p.newRegexp(OpCharClass)
809 // Look for Unicode character group like \p{Han}
810 if len(t) >= 2 && (t[1] == 'p' || t[1] == 'P') {
811 r, rest, err := p.parseUnicodeClass(t, re.Rune0[:0])
823 // Perl character class escape.
824 if r, rest := p.parsePerlClassEscape(t, re.Rune0[:0]); r != nil {
832 // Ordinary single-character escape.
833 if c, t, err = p.parseEscape(t); err != nil {
842 if p.swapVerticalBar() {
844 p.stack = p.stack[:len(p.stack)-1]
850 return nil, &Error{ErrMissingParen, s}
852 return p.stack[0], nil
855 // parseRepeat parses {min} (max=min) or {min,} (max=-1) or {min,max}.
856 // If s is not of that form, it returns ok == false.
857 // If s has the right form but the values are too big, it returns min == -1, ok == true.
858 func (p *parser) parseRepeat(s string) (min, max int, rest string, ok bool) {
859 if s == "" || s[0] != '{' {
864 if min, s, ok1 = p.parseInt(s); !ok1 {
879 } else if max, s, ok1 = p.parseInt(s); !ok1 {
882 // parseInt found too big a number
886 if s == "" || s[0] != '}' {
894 // parsePerlFlags parses a Perl flag setting or non-capturing group or both,
895 // like (?i) or (?: or (?i:. It removes the prefix from s and updates the parse state.
896 // The caller must have ensured that s begins with "(?".
897 func (p *parser) parsePerlFlags(s string) (rest string, err error) {
900 // Check for named captures, first introduced in Python's regexp library.
901 // As usual, there are three slightly different syntaxes:
903 // (?P<name>expr) the original, introduced by Python
904 // (?<name>expr) the .NET alteration, adopted by Perl 5.10
905 // (?'name'expr) another .NET alteration, adopted by Perl 5.10
907 // Perl 5.10 gave in and implemented the Python version too,
908 // but they claim that the last two are the preferred forms.
909 // PCRE and languages based on it (specifically, PHP and Ruby)
910 // support all three as well. EcmaScript 4 uses only the Python form.
912 // In both the open source world (via Code Search) and the
913 // Google source tree, (?P<expr>name) is the dominant form,
914 // so that's the one we implement. One is enough.
915 if len(t) > 4 && t[2] == 'P' && t[3] == '<' {
917 end := strings.IndexRune(t, '>')
919 if err = checkUTF8(t); err != nil {
922 return "", &Error{ErrInvalidNamedCapture, s}
925 capture := t[:end+1] // "(?P<name>"
926 name := t[4:end] // "name"
927 if err = checkUTF8(name); err != nil {
930 if !isValidCaptureName(name) {
931 return "", &Error{ErrInvalidNamedCapture, capture}
934 // Like ordinary capture, but named.
936 re := p.op(opLeftParen)
939 return t[end+1:], nil
942 // Non-capturing group. Might also twiddle Perl flags.
950 if c, t, err = nextRune(t); err != nil {
971 // Switch to negation.
977 // Invert flags so that | above turn into &^ and vice versa.
978 // We'll invert flags again before using it below.
982 // End of flags, starting group or not.
999 return "", &Error{ErrInvalidPerlOp, s[:len(s)-len(t)]}
1002 // isValidCaptureName reports whether name
1003 // is a valid capture name: [A-Za-z0-9_]+.
1004 // PCRE limits names to 32 bytes.
1005 // Python rejects names starting with digits.
1006 // We don't enforce either of those.
1007 func isValidCaptureName(name string) bool {
1011 for _, c := range name {
1012 if c != '_' && !isalnum(c) {
1019 // parseInt parses a decimal integer.
1020 func (p *parser) parseInt(s string) (n int, rest string, ok bool) {
1021 if s == "" || s[0] < '0' || '9' < s[0] {
1024 // Disallow leading zeros.
1025 if len(s) >= 2 && s[0] == '0' && '0' <= s[1] && s[1] <= '9' {
1029 for s != "" && '0' <= s[0] && s[0] <= '9' {
1034 // Have digits, compute value.
1035 t = t[:len(t)-len(s)]
1036 for i := 0; i < len(t); i++ {
1042 n = n*10 + int(t[i]) - '0'
1047 // can this be represented as a character class?
1048 // single-rune literal string, char class, ., and .|\n.
1049 func isCharClass(re *Regexp) bool {
1050 return re.Op == OpLiteral && len(re.Rune) == 1 ||
1051 re.Op == OpCharClass ||
1052 re.Op == OpAnyCharNotNL ||
1057 func matchRune(re *Regexp, r rune) bool {
1060 return len(re.Rune) == 1 && re.Rune[0] == r
1062 for i := 0; i < len(re.Rune); i += 2 {
1063 if re.Rune[i] <= r && r <= re.Rune[i+1] {
1068 case OpAnyCharNotNL:
1076 // parseVerticalBar handles a | in the input.
1077 func (p *parser) parseVerticalBar() error {
1080 // The concatenation we just parsed is on top of the stack.
1081 // If it sits above an opVerticalBar, swap it below
1082 // (things below an opVerticalBar become an alternation).
1083 // Otherwise, push a new vertical bar.
1084 if !p.swapVerticalBar() {
1091 // mergeCharClass makes dst = dst|src.
1092 // The caller must ensure that dst.Op >= src.Op,
1093 // to reduce the amount of copying.
1094 func mergeCharClass(dst, src *Regexp) {
1097 // src doesn't add anything.
1098 case OpAnyCharNotNL:
1100 if matchRune(src, '\n') {
1104 // src is simpler, so either literal or char class
1105 if src.Op == OpLiteral {
1106 dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags)
1108 dst.Rune = appendClass(dst.Rune, src.Rune)
1112 if src.Rune[0] == dst.Rune[0] && src.Flags == dst.Flags {
1115 dst.Op = OpCharClass
1116 dst.Rune = appendLiteral(dst.Rune[:0], dst.Rune[0], dst.Flags)
1117 dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags)
1121 // If the top of the stack is an element followed by an opVerticalBar
1122 // swapVerticalBar swaps the two and returns true.
1123 // Otherwise it returns false.
1124 func (p *parser) swapVerticalBar() bool {
1125 // If above and below vertical bar are literal or char class,
1126 // can merge into a single char class.
1128 if n >= 3 && p.stack[n-2].Op == opVerticalBar && isCharClass(p.stack[n-1]) && isCharClass(p.stack[n-3]) {
1131 // Make re3 the more complex of the two.
1132 if re1.Op > re3.Op {
1136 mergeCharClass(re3, re1)
1138 p.stack = p.stack[:n-1]
1145 if re2.Op == opVerticalBar {
1147 // Now out of reach.
1148 // Clean opportunistically.
1149 cleanAlt(p.stack[n-3])
1159 // parseRightParen handles a ) in the input.
1160 func (p *parser) parseRightParen() error {
1162 if p.swapVerticalBar() {
1164 p.stack = p.stack[:len(p.stack)-1]
1170 return &Error{errUnexpectedParen, p.wholeRegexp}
1174 p.stack = p.stack[:n-2]
1175 if re2.Op != opLeftParen {
1176 return &Error{errUnexpectedParen, p.wholeRegexp}
1178 // Restore flags at time of paren.
1181 // Just for grouping.
1185 re2.Sub = re2.Sub0[:1]
1192 // parseEscape parses an escape sequence at the beginning of s
1193 // and returns the rune.
1194 func (p *parser) parseEscape(s string) (r rune, rest string, err error) {
1197 return 0, "", &Error{ErrTrailingBackslash, ""}
1199 c, t, err := nextRune(t)
1207 if c < utf8.RuneSelf && !isalnum(c) {
1208 // Escaped non-word characters are always themselves.
1209 // PCRE is not quite so rigorous: it accepts things like
1210 // \q, but we don't. We once rejected \_, but too many
1211 // programs and people insist on using it, so allow \_.
1216 case '1', '2', '3', '4', '5', '6', '7':
1217 // Single non-zero digit is a backreference; not supported
1218 if t == "" || t[0] < '0' || t[0] > '7' {
1223 // Consume up to three octal digits; already have one.
1225 for i := 1; i < 3; i++ {
1226 if t == "" || t[0] < '0' || t[0] > '7' {
1229 r = r*8 + rune(t[0]) - '0'
1234 // Hexadecimal escapes.
1239 if c, t, err = nextRune(t); err != nil {
1243 // Any number of digits in braces.
1244 // Perl accepts any text at all; it ignores all text
1245 // after the first non-hex digit. We require only hex digits,
1246 // and at least one.
1253 if c, t, err = nextRune(t); err != nil {
1264 if r > unicode.MaxRune {
1275 // Easy case: two hex digits.
1277 if c, t, err = nextRune(t); err != nil {
1284 return x*16 + y, t, nil
1286 // C escapes. There is no case 'b', to avoid misparsing
1287 // the Perl word-boundary \b as the C backspace \b
1288 // when in POSIX mode. In Perl, /\b/ means word-boundary
1289 // but /[\b]/ means backspace. We don't support that.
1290 // If you want a backspace, embed a literal backspace
1291 // character or use \x08.
1305 return 0, "", &Error{ErrInvalidEscape, s[:len(s)-len(t)]}
1308 // parseClassChar parses a character class character at the beginning of s
1310 func (p *parser) parseClassChar(s, wholeClass string) (r rune, rest string, err error) {
1312 return 0, "", &Error{Code: ErrMissingBracket, Expr: wholeClass}
1315 // Allow regular escape sequences even though
1316 // many need not be escaped in this context.
1318 return p.parseEscape(s)
1324 type charGroup struct {
1329 // parsePerlClassEscape parses a leading Perl character class escape like \d
1330 // from the beginning of s. If one is present, it appends the characters to r
1331 // and returns the new slice r and the remainder of the string.
1332 func (p *parser) parsePerlClassEscape(s string, r []rune) (out []rune, rest string) {
1333 if p.flags&PerlX == 0 || len(s) < 2 || s[0] != '\\' {
1336 g := perlGroup[s[0:2]]
1340 return p.appendGroup(r, g), s[2:]
1343 // parseNamedClass parses a leading POSIX named character class like [:alnum:]
1344 // from the beginning of s. If one is present, it appends the characters to r
1345 // and returns the new slice r and the remainder of the string.
1346 func (p *parser) parseNamedClass(s string, r []rune) (out []rune, rest string, err error) {
1347 if len(s) < 2 || s[0] != '[' || s[1] != ':' {
1351 i := strings.Index(s[2:], ":]")
1356 name, s := s[0:i+2], s[i+2:]
1357 g := posixGroup[name]
1359 return nil, "", &Error{ErrInvalidCharRange, name}
1361 return p.appendGroup(r, g), s, nil
1364 func (p *parser) appendGroup(r []rune, g charGroup) []rune {
1365 if p.flags&FoldCase == 0 {
1367 r = appendNegatedClass(r, g.class)
1369 r = appendClass(r, g.class)
1372 tmp := p.tmpClass[:0]
1373 tmp = appendFoldedClass(tmp, g.class)
1375 tmp = cleanClass(&p.tmpClass)
1377 r = appendNegatedClass(r, tmp)
1379 r = appendClass(r, tmp)
1385 var anyTable = &unicode.RangeTable{
1386 R16: []unicode.Range16{{Lo: 0, Hi: 1<<16 - 1, Stride: 1}},
1387 R32: []unicode.Range32{{Lo: 1 << 16, Hi: unicode.MaxRune, Stride: 1}},
1390 // unicodeTable returns the unicode.RangeTable identified by name
1391 // and the table of additional fold-equivalent code points.
1392 func unicodeTable(name string) (*unicode.RangeTable, *unicode.RangeTable) {
1393 // Special case: "Any" means any.
1395 return anyTable, anyTable
1397 if t := unicode.Categories[name]; t != nil {
1398 return t, unicode.FoldCategory[name]
1400 if t := unicode.Scripts[name]; t != nil {
1401 return t, unicode.FoldScript[name]
1406 // parseUnicodeClass parses a leading Unicode character class like \p{Han}
1407 // from the beginning of s. If one is present, it appends the characters to r
1408 // and returns the new slice r and the remainder of the string.
1409 func (p *parser) parseUnicodeClass(s string, r []rune) (out []rune, rest string, err error) {
1410 if p.flags&UnicodeGroups == 0 || len(s) < 2 || s[0] != '\\' || s[1] != 'p' && s[1] != 'P' {
1414 // Committed to parse or return error.
1420 c, t, err := nextRune(t)
1424 var seq, name string
1426 // Single-letter name.
1427 seq = s[:len(s)-len(t)]
1430 // Name is in braces.
1431 end := strings.IndexRune(s, '}')
1433 if err = checkUTF8(s); err != nil {
1436 return nil, "", &Error{ErrInvalidCharRange, s}
1438 seq, t = s[:end+1], s[end+1:]
1440 if err = checkUTF8(name); err != nil {
1445 // Group can have leading negation too. \p{^Han} == \P{Han}, \P{^Han} == \p{Han}.
1446 if name != "" && name[0] == '^' {
1451 tab, fold := unicodeTable(name)
1453 return nil, "", &Error{ErrInvalidCharRange, seq}
1456 if p.flags&FoldCase == 0 || fold == nil {
1458 r = appendTable(r, tab)
1460 r = appendNegatedTable(r, tab)
1463 // Merge and clean tab and fold in a temporary buffer.
1464 // This is necessary for the negative case and just tidy
1465 // for the positive case.
1466 tmp := p.tmpClass[:0]
1467 tmp = appendTable(tmp, tab)
1468 tmp = appendTable(tmp, fold)
1470 tmp = cleanClass(&p.tmpClass)
1472 r = appendClass(r, tmp)
1474 r = appendNegatedClass(r, tmp)
1480 // parseClass parses a character class at the beginning of s
1481 // and pushes it onto the parse stack.
1482 func (p *parser) parseClass(s string) (rest string, err error) {
1483 t := s[1:] // chop [
1484 re := p.newRegexp(OpCharClass)
1486 re.Rune = re.Rune0[:0]
1489 if t != "" && t[0] == '^' {
1493 // If character class does not match \n, add it here,
1494 // so that negation later will do the right thing.
1495 if p.flags&ClassNL == 0 {
1496 re.Rune = append(re.Rune, '\n', '\n')
1501 first := true // ] and - are okay as first char in class
1502 for t == "" || t[0] != ']' || first {
1503 // POSIX: - is only okay unescaped as first or last in class.
1504 // Perl: - is okay anywhere.
1505 if t != "" && t[0] == '-' && p.flags&PerlX == 0 && !first && (len(t) == 1 || t[1] != ']') {
1506 _, size := utf8.DecodeRuneInString(t[1:])
1507 return "", &Error{Code: ErrInvalidCharRange, Expr: t[:1+size]}
1511 // Look for POSIX [:alnum:] etc.
1512 if len(t) > 2 && t[0] == '[' && t[1] == ':' {
1513 nclass, nt, err := p.parseNamedClass(t, class)
1518 class, t = nclass, nt
1523 // Look for Unicode character group like \p{Han}.
1524 nclass, nt, err := p.parseUnicodeClass(t, class)
1529 class, t = nclass, nt
1533 // Look for Perl character class symbols (extension).
1534 if nclass, nt := p.parsePerlClassEscape(t, class); nclass != nil {
1535 class, t = nclass, nt
1539 // Single character or simple range.
1542 if lo, t, err = p.parseClassChar(t, s); err != nil {
1546 // [a-] means (a|-) so check for final ].
1547 if len(t) >= 2 && t[0] == '-' && t[1] != ']' {
1549 if hi, t, err = p.parseClassChar(t, s); err != nil {
1553 rng = rng[:len(rng)-len(t)]
1554 return "", &Error{Code: ErrInvalidCharRange, Expr: rng}
1557 if p.flags&FoldCase == 0 {
1558 class = AppendRange(class, lo, hi)
1560 class = appendFoldedRange(class, lo, hi)
1565 // Use &re.Rune instead of &class to avoid allocation.
1567 class = cleanClass(&re.Rune)
1569 class = negateClass(class)
1576 // cleanClass sorts the ranges (pairs of elements of r),
1577 // merges them, and eliminates duplicates.
1578 func cleanClass(rp *[]rune) []rune {
1580 // Sort by lo increasing, hi decreasing to break ties.
1581 sort.Sort(ranges{rp})
1588 // Merge abutting, overlapping.
1589 w := 2 // write index
1590 for i := 2; i < len(r); i += 2 {
1591 lo, hi := r[i], r[i+1]
1593 // merge with previous range
1599 // new disjoint range
1608 // appendLiteral returns the result of appending the literal x to the class r.
1609 func appendLiteral(r []rune, x rune, flags Flags) []rune {
1610 if flags&FoldCase != 0 {
1611 return appendFoldedRange(r, x, x)
1613 return AppendRange(r, x, x)
1616 // appendRange returns the result of appending the range lo-hi to the class r.
1617 func AppendRange(r []rune, lo, hi rune) []rune {
1618 // Expand last range or next to last range if it overlaps or abuts.
1619 // Checking two ranges helps when appending case-folded
1620 // alphabets, so that one range can be expanding A-Z and the
1621 // other expanding a-z.
1623 for i := 2; i <= 4; i += 2 { // twice, using i=2, i=4
1625 rlo, rhi := r[n-i], r[n-i+1]
1626 if lo <= rhi+1 && rlo <= hi+1 {
1638 return append(r, lo, hi)
1642 // minimum and maximum runes involved in folding.
1643 // checked during test.
1648 // appendFoldedRange returns the result of appending the range lo-hi
1649 // and its case folding-equivalent runes to the class r.
1650 func appendFoldedRange(r []rune, lo, hi rune) []rune {
1652 if lo <= MinFold && hi >= MaxFold {
1653 // Range is full: folding can't add more.
1654 return AppendRange(r, lo, hi)
1656 if hi < MinFold || lo > MaxFold {
1657 // Range is outside folding possibilities.
1658 return AppendRange(r, lo, hi)
1661 // [lo, MinFold-1] needs no folding.
1662 r = AppendRange(r, lo, MinFold-1)
1666 // [MaxFold+1, hi] needs no folding.
1667 r = AppendRange(r, MaxFold+1, hi)
1671 // Brute force. Depend on AppendRange to coalesce ranges on the fly.
1672 for c := lo; c <= hi; c++ {
1673 r = AppendRange(r, c, c)
1674 f := unicode.SimpleFold(c)
1676 r = AppendRange(r, f, f)
1677 f = unicode.SimpleFold(f)
1683 // appendClass returns the result of appending the class x to the class r.
1684 // It assume x is clean.
1685 func appendClass(r []rune, x []rune) []rune {
1686 for i := 0; i < len(x); i += 2 {
1687 r = AppendRange(r, x[i], x[i+1])
1692 // appendFolded returns the result of appending the case folding of the class x to the class r.
1693 func appendFoldedClass(r []rune, x []rune) []rune {
1694 for i := 0; i < len(x); i += 2 {
1695 r = appendFoldedRange(r, x[i], x[i+1])
1700 // appendNegatedClass returns the result of appending the negation of the class x to the class r.
1701 // It assumes x is clean.
1702 func appendNegatedClass(r []rune, x []rune) []rune {
1704 for i := 0; i < len(x); i += 2 {
1705 lo, hi := x[i], x[i+1]
1707 r = AppendRange(r, nextLo, lo-1)
1711 if nextLo <= unicode.MaxRune {
1712 r = AppendRange(r, nextLo, unicode.MaxRune)
1717 // appendTable returns the result of appending x to the class r.
1718 func appendTable(r []rune, x *unicode.RangeTable) []rune {
1719 for _, xr := range x.R16 {
1720 lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
1722 r = AppendRange(r, lo, hi)
1725 for c := lo; c <= hi; c += stride {
1726 r = AppendRange(r, c, c)
1729 for _, xr := range x.R32 {
1730 lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
1732 r = AppendRange(r, lo, hi)
1735 for c := lo; c <= hi; c += stride {
1736 r = AppendRange(r, c, c)
1742 // appendNegatedTable returns the result of appending the negation of x to the class r.
1743 func appendNegatedTable(r []rune, x *unicode.RangeTable) []rune {
1744 nextLo := '\u0000' // lo end of next class to add
1745 for _, xr := range x.R16 {
1746 lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
1749 r = AppendRange(r, nextLo, lo-1)
1754 for c := lo; c <= hi; c += stride {
1756 r = AppendRange(r, nextLo, c-1)
1761 for _, xr := range x.R32 {
1762 lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
1765 r = AppendRange(r, nextLo, lo-1)
1770 for c := lo; c <= hi; c += stride {
1772 r = AppendRange(r, nextLo, c-1)
1777 if nextLo <= unicode.MaxRune {
1778 r = AppendRange(r, nextLo, unicode.MaxRune)
1783 // negateClass overwrites r and returns r's negation.
1784 // It assumes the class r is already clean.
1785 func negateClass(r []rune) []rune {
1786 nextLo := '\u0000' // lo end of next class to add
1787 w := 0 // write index
1788 for i := 0; i < len(r); i += 2 {
1789 lo, hi := r[i], r[i+1]
1798 if nextLo <= unicode.MaxRune {
1799 // It's possible for the negation to have one more
1800 // range - this one - than the original class, so use append.
1801 r = append(r, nextLo, unicode.MaxRune)
1806 // ranges implements sort.Interface on a []rune.
1807 // The choice of receiver type definition is strange
1808 // but avoids an allocation since we already have
1810 type ranges struct {
1814 func (ra ranges) Less(i, j int) bool {
1818 return p[i] < p[j] || p[i] == p[j] && p[i+1] > p[j+1]
1821 func (ra ranges) Len() int {
1822 return len(*ra.p) / 2
1825 func (ra ranges) Swap(i, j int) {
1829 p[i], p[i+1], p[j], p[j+1] = p[j], p[j+1], p[i], p[i+1]
1832 func checkUTF8(s string) error {
1834 rune, size := utf8.DecodeRuneInString(s)
1835 if rune == utf8.RuneError && size == 1 {
1836 return &Error{Code: ErrInvalidUTF8, Expr: s}
1843 func nextRune(s string) (c rune, t string, err error) {
1844 c, size := utf8.DecodeRuneInString(s)
1845 if c == utf8.RuneError && size == 1 {
1846 return 0, "", &Error{Code: ErrInvalidUTF8, Expr: s}
1848 return c, s[size:], nil
1851 func isalnum(c rune) bool {
1852 return '0' <= c && c <= '9' || 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z'
1855 func unhex(c rune) rune {
1856 if '0' <= c && c <= '9' {
1859 if 'a' <= c && c <= 'f' {
1862 if 'A' <= c && c <= 'F' {