libgo/go/regexp/regexp.go

   1 // Copyright 2009 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 // Package regexp implements regular expression search.
   6 //
   7 // The syntax of the regular expressions accepted is the same
   8 // general syntax used by Perl, Python, and other languages.
   9 // More precisely, it is the syntax accepted by RE2 and described at
  10 // http://code.google.com/p/re2/wiki/Syntax, except for \C.
  11 // For an overview of the syntax, run
  12 //   godoc regexp/syntax
  13 //
  14 // All characters are UTF-8-encoded code points.
  15 //
  16 // There are 16 methods of Regexp that match a regular expression and identify
  17 // the matched text.  Their names are matched by this regular expression:
  18 //
  19 //      Find(All)?(String)?(Submatch)?(Index)?
  20 //
  21 // If 'All' is present, the routine matches successive non-overlapping
  22 // matches of the entire expression.  Empty matches abutting a preceding
  23 // match are ignored.  The return value is a slice containing the successive
  24 // return values of the corresponding non-'All' routine.  These routines take
  25 // an extra integer argument, n; if n >= 0, the function returns at most n
  26 // matches/submatches.
  27 //
  28 // If 'String' is present, the argument is a string; otherwise it is a slice
  29 // of bytes; return values are adjusted as appropriate.
  30 //
  31 // If 'Submatch' is present, the return value is a slice identifying the
  32 // successive submatches of the expression. Submatches are matches of
  33 // parenthesized subexpressions (also known as capturing groups) within the
  34 // regular expression, numbered from left to right in order of opening
  35 // parenthesis. Submatch 0 is the match of the entire expression, submatch 1
  36 // the match of the first parenthesized subexpression, and so on.
  37 //
  38 // If 'Index' is present, matches and submatches are identified by byte index
  39 // pairs within the input string: result[2*n:2*n+1] identifies the indexes of
  40 // the nth submatch.  The pair for n==0 identifies the match of the entire
  41 // expression.  If 'Index' is not present, the match is identified by the
  42 // text of the match/submatch.  If an index is negative, it means that
  43 // subexpression did not match any string in the input.
  44 //
  45 // There is also a subset of the methods that can be applied to text read
  46 // from a RuneReader:
  47 //
  48 //      MatchReader, FindReaderIndex, FindReaderSubmatchIndex
  49 //
  50 // This set may grow.  Note that regular expression matches may need to
  51 // examine text beyond the text returned by a match, so the methods that
  52 // match text from a RuneReader may read arbitrarily far into the input
  53 // before returning.
  54 //
  55 // (There are a few other methods that do not match this pattern.)
  56 //
  57 package regexp
  58
  59 import (
  60         "bytes"
  61         "io"
  62         "regexp/syntax"
  63         "strconv"
  64         "strings"
  65         "sync"
  66         "unicode"
  67         "unicode/utf8"
  68 )
  69
  70 var debug = false
  71
  72 // Regexp is the representation of a compiled regular expression.
  73 // The public interface is entirely through methods.
  74 // A Regexp is safe for concurrent use by multiple goroutines.
  75 type Regexp struct {
  76         // read-only after Compile
  77         expr           string         // as passed to Compile
  78         prog           *syntax.Prog   // compiled program
  79         prefix         string         // required prefix in unanchored matches
  80         prefixBytes    []byte         // prefix, as a []byte
  81         prefixComplete bool           // prefix is the entire regexp
  82         prefixRune     rune           // first rune in prefix
  83         cond           syntax.EmptyOp // empty-width conditions required at start of match
  84         numSubexp      int
  85         subexpNames    []string
  86         longest        bool
  87
  88         // cache of machines for running regexp
  89         mu      sync.Mutex
  90         machine []*machine
  91 }
  92
  93 // String returns the source text used to compile the regular expression.
  94 func (re *Regexp) String() string {
  95         return re.expr
  96 }
  97
  98 // Compile parses a regular expression and returns, if successful,
  99 // a Regexp object that can be used to match against text.
 100 //
 101 // When matching against text, the regexp returns a match that
 102 // begins as early as possible in the input (leftmost), and among those
 103 // it chooses the one that a backtracking search would have found first.
 104 // This so-called leftmost-first matching is the same semantics
 105 // that Perl, Python, and other implementations use, although this
 106 // package implements it without the expense of backtracking.
 107 // For POSIX leftmost-longest matching, see CompilePOSIX.
 108 func Compile(expr string) (*Regexp, error) {
 109         return compile(expr, syntax.Perl, false)
 110 }
 111
 112 // CompilePOSIX is like Compile but restricts the regular expression
 113 // to POSIX ERE (egrep) syntax and changes the match semantics to
 114 // leftmost-longest.
 115 //
 116 // That is, when matching against text, the regexp returns a match that
 117 // begins as early as possible in the input (leftmost), and among those
 118 // it chooses a match that is as long as possible.
 119 // This so-called leftmost-longest matching is the same semantics
 120 // that early regular expression implementations used and that POSIX
 121 // specifies.
 122 //
 123 // However, there can be multiple leftmost-longest matches, with different
 124 // submatch choices, and here this package diverges from POSIX.
 125 // Among the possible leftmost-longest matches, this package chooses
 126 // the one that a backtracking search would have found first, while POSIX
 127 // specifies that the match be chosen to maximize the length of the first
 128 // subexpression, then the second, and so on from left to right.
 129 // The POSIX rule is computationally prohibitive and not even well-defined.
 130 // See http://swtch.com/~rsc/regexp/regexp2.html#posix for details.
 131 func CompilePOSIX(expr string) (*Regexp, error) {
 132         return compile(expr, syntax.POSIX, true)
 133 }
 134
 135 // Longest makes future searches prefer the leftmost-longest match.
 136 // That is, when matching against text, the regexp returns a match that
 137 // begins as early as possible in the input (leftmost), and among those
 138 // it chooses a match that is as long as possible.
 139 func (re *Regexp) Longest() {
 140         re.longest = true
 141 }
 142
 143 func compile(expr string, mode syntax.Flags, longest bool) (*Regexp, error) {
 144         re, err := syntax.Parse(expr, mode)
 145         if err != nil {
 146                 return nil, err
 147         }
 148         maxCap := re.MaxCap()
 149         capNames := re.CapNames()
 150
 151         re = re.Simplify()
 152         prog, err := syntax.Compile(re)
 153         if err != nil {
 154                 return nil, err
 155         }
 156         regexp := &Regexp{
 157                 expr:        expr,
 158                 prog:        prog,
 159                 numSubexp:   maxCap,
 160                 subexpNames: capNames,
 161                 cond:        prog.StartCond(),
 162                 longest:     longest,
 163         }
 164         regexp.prefix, regexp.prefixComplete = prog.Prefix()
 165         if regexp.prefix != "" {
 166                 // TODO(rsc): Remove this allocation by adding
 167                 // IndexString to package bytes.
 168                 regexp.prefixBytes = []byte(regexp.prefix)
 169                 regexp.prefixRune, _ = utf8.DecodeRuneInString(regexp.prefix)
 170         }
 171         return regexp, nil
 172 }
 173
 174 // get returns a machine to use for matching re.
 175 // It uses the re's machine cache if possible, to avoid
 176 // unnecessary allocation.
 177 func (re *Regexp) get() *machine {
 178         re.mu.Lock()
 179         if n := len(re.machine); n > 0 {
 180                 z := re.machine[n-1]
 181                 re.machine = re.machine[:n-1]
 182                 re.mu.Unlock()
 183                 return z
 184         }
 185         re.mu.Unlock()
 186         z := progMachine(re.prog)
 187         z.re = re
 188         return z
 189 }
 190
 191 // put returns a machine to the re's machine cache.
 192 // There is no attempt to limit the size of the cache, so it will
 193 // grow to the maximum number of simultaneous matches
 194 // run using re.  (The cache empties when re gets garbage collected.)
 195 func (re *Regexp) put(z *machine) {
 196         re.mu.Lock()
 197         re.machine = append(re.machine, z)
 198         re.mu.Unlock()
 199 }
 200
 201 // MustCompile is like Compile but panics if the expression cannot be parsed.
 202 // It simplifies safe initialization of global variables holding compiled regular
 203 // expressions.
 204 func MustCompile(str string) *Regexp {
 205         regexp, error := Compile(str)
 206         if error != nil {
 207                 panic(`regexp: Compile(` + quote(str) + `): ` + error.Error())
 208         }
 209         return regexp
 210 }
 211
 212 // MustCompilePOSIX is like CompilePOSIX but panics if the expression cannot be parsed.
 213 // It simplifies safe initialization of global variables holding compiled regular
 214 // expressions.
 215 func MustCompilePOSIX(str string) *Regexp {
 216         regexp, error := CompilePOSIX(str)
 217         if error != nil {
 218                 panic(`regexp: CompilePOSIX(` + quote(str) + `): ` + error.Error())
 219         }
 220         return regexp
 221 }
 222
 223 func quote(s string) string {
 224         if strconv.CanBackquote(s) {
 225                 return "`" + s + "`"
 226         }
 227         return strconv.Quote(s)
 228 }
 229
 230 // NumSubexp returns the number of parenthesized subexpressions in this Regexp.
 231 func (re *Regexp) NumSubexp() int {
 232         return re.numSubexp
 233 }
 234
 235 // SubexpNames returns the names of the parenthesized subexpressions
 236 // in this Regexp.  The name for the first sub-expression is names[1],
 237 // so that if m is a match slice, the name for m[i] is SubexpNames()[i].
 238 // Since the Regexp as a whole cannot be named, names[0] is always
 239 // the empty string.  The slice should not be modified.
 240 func (re *Regexp) SubexpNames() []string {
 241         return re.subexpNames
 242 }
 243
 244 const endOfText rune = -1
 245
 246 // input abstracts different representations of the input text. It provides
 247 // one-character lookahead.
 248 type input interface {
 249         step(pos int) (r rune, width int) // advance one rune
 250         canCheckPrefix() bool             // can we look ahead without losing info?
 251         hasPrefix(re *Regexp) bool
 252         index(re *Regexp, pos int) int
 253         context(pos int) syntax.EmptyOp
 254 }
 255
 256 // inputString scans a string.
 257 type inputString struct {
 258         str string
 259 }
 260
 261 func (i *inputString) step(pos int) (rune, int) {
 262         if pos < len(i.str) {
 263                 c := i.str[pos]
 264                 if c < utf8.RuneSelf {
 265                         return rune(c), 1
 266                 }
 267                 return utf8.DecodeRuneInString(i.str[pos:])
 268         }
 269         return endOfText, 0
 270 }
 271
 272 func (i *inputString) canCheckPrefix() bool {
 273         return true
 274 }
 275
 276 func (i *inputString) hasPrefix(re *Regexp) bool {
 277         return strings.HasPrefix(i.str, re.prefix)
 278 }
 279
 280 func (i *inputString) index(re *Regexp, pos int) int {
 281         return strings.Index(i.str[pos:], re.prefix)
 282 }
 283
 284 func (i *inputString) context(pos int) syntax.EmptyOp {
 285         r1, r2 := endOfText, endOfText
 286         if pos > 0 && pos <= len(i.str) {
 287                 r1, _ = utf8.DecodeLastRuneInString(i.str[:pos])
 288         }
 289         if pos < len(i.str) {
 290                 r2, _ = utf8.DecodeRuneInString(i.str[pos:])
 291         }
 292         return syntax.EmptyOpContext(r1, r2)
 293 }
 294
 295 // inputBytes scans a byte slice.
 296 type inputBytes struct {
 297         str []byte
 298 }
 299
 300 func (i *inputBytes) step(pos int) (rune, int) {
 301         if pos < len(i.str) {
 302                 c := i.str[pos]
 303                 if c < utf8.RuneSelf {
 304                         return rune(c), 1
 305                 }
 306                 return utf8.DecodeRune(i.str[pos:])
 307         }
 308         return endOfText, 0
 309 }
 310
 311 func (i *inputBytes) canCheckPrefix() bool {
 312         return true
 313 }
 314
 315 func (i *inputBytes) hasPrefix(re *Regexp) bool {
 316         return bytes.HasPrefix(i.str, re.prefixBytes)
 317 }
 318
 319 func (i *inputBytes) index(re *Regexp, pos int) int {
 320         return bytes.Index(i.str[pos:], re.prefixBytes)
 321 }
 322
 323 func (i *inputBytes) context(pos int) syntax.EmptyOp {
 324         r1, r2 := endOfText, endOfText
 325         if pos > 0 && pos <= len(i.str) {
 326                 r1, _ = utf8.DecodeLastRune(i.str[:pos])
 327         }
 328         if pos < len(i.str) {
 329                 r2, _ = utf8.DecodeRune(i.str[pos:])
 330         }
 331         return syntax.EmptyOpContext(r1, r2)
 332 }
 333
 334 // inputReader scans a RuneReader.
 335 type inputReader struct {
 336         r     io.RuneReader
 337         atEOT bool
 338         pos   int
 339 }
 340
 341 func (i *inputReader) step(pos int) (rune, int) {
 342         if !i.atEOT && pos != i.pos {
 343                 return endOfText, 0
 344
 345         }
 346         r, w, err := i.r.ReadRune()
 347         if err != nil {
 348                 i.atEOT = true
 349                 return endOfText, 0
 350         }
 351         i.pos += w
 352         return r, w
 353 }
 354
 355 func (i *inputReader) canCheckPrefix() bool {
 356         return false
 357 }
 358
 359 func (i *inputReader) hasPrefix(re *Regexp) bool {
 360         return false
 361 }
 362
 363 func (i *inputReader) index(re *Regexp, pos int) int {
 364         return -1
 365 }
 366
 367 func (i *inputReader) context(pos int) syntax.EmptyOp {
 368         return 0
 369 }
 370
 371 // LiteralPrefix returns a literal string that must begin any match
 372 // of the regular expression re.  It returns the boolean true if the
 373 // literal string comprises the entire regular expression.
 374 func (re *Regexp) LiteralPrefix() (prefix string, complete bool) {
 375         return re.prefix, re.prefixComplete
 376 }
 377
 378 // MatchReader returns whether the Regexp matches the text read by the
 379 // RuneReader.  The return value is a boolean: true for match, false for no
 380 // match.
 381 func (re *Regexp) MatchReader(r io.RuneReader) bool {
 382         return re.doExecute(r, nil, "", 0, 0) != nil
 383 }
 384
 385 // MatchString returns whether the Regexp matches the string s.
 386 // The return value is a boolean: true for match, false for no match.
 387 func (re *Regexp) MatchString(s string) bool {
 388         return re.doExecute(nil, nil, s, 0, 0) != nil
 389 }
 390
 391 // Match returns whether the Regexp matches the byte slice b.
 392 // The return value is a boolean: true for match, false for no match.
 393 func (re *Regexp) Match(b []byte) bool {
 394         return re.doExecute(nil, b, "", 0, 0) != nil
 395 }
 396
 397 // MatchReader checks whether a textual regular expression matches the text
 398 // read by the RuneReader.  More complicated queries need to use Compile and
 399 // the full Regexp interface.
 400 func MatchReader(pattern string, r io.RuneReader) (matched bool, err error) {
 401         re, err := Compile(pattern)
 402         if err != nil {
 403                 return false, err
 404         }
 405         return re.MatchReader(r), nil
 406 }
 407
 408 // MatchString checks whether a textual regular expression
 409 // matches a string.  More complicated queries need
 410 // to use Compile and the full Regexp interface.
 411 func MatchString(pattern string, s string) (matched bool, err error) {
 412         re, err := Compile(pattern)
 413         if err != nil {
 414                 return false, err
 415         }
 416         return re.MatchString(s), nil
 417 }
 418
 419 // Match checks whether a textual regular expression
 420 // matches a byte slice.  More complicated queries need
 421 // to use Compile and the full Regexp interface.
 422 func Match(pattern string, b []byte) (matched bool, err error) {
 423         re, err := Compile(pattern)
 424         if err != nil {
 425                 return false, err
 426         }
 427         return re.Match(b), nil
 428 }
 429
 430 // ReplaceAllString returns a copy of src, replacing matches of the Regexp
 431 // with the replacement string repl.  Inside repl, $ signs are interpreted as
 432 // in Expand, so for instance $1 represents the text of the first submatch.
 433 func (re *Regexp) ReplaceAllString(src, repl string) string {
 434         n := 2
 435         if strings.Index(repl, "$") >= 0 {
 436                 n = 2 * (re.numSubexp + 1)
 437         }
 438         b := re.replaceAll(nil, src, n, func(dst []byte, match []int) []byte {
 439                 return re.expand(dst, repl, nil, src, match)
 440         })
 441         return string(b)
 442 }
 443
 444 // ReplaceAllStringLiteral returns a copy of src, replacing matches of the Regexp
 445 // with the replacement string repl.  The replacement repl is substituted directly,
 446 // without using Expand.
 447 func (re *Regexp) ReplaceAllLiteralString(src, repl string) string {
 448         return string(re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte {
 449                 return append(dst, repl...)
 450         }))
 451 }
 452
 453 // ReplaceAllStringFunc returns a copy of src in which all matches of the
 454 // Regexp have been replaced by the return value of function repl applied
 455 // to the matched substring.  The replacement returned by repl is substituted
 456 // directly, without using Expand.
 457 func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string {
 458         b := re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte {
 459                 return append(dst, repl(src[match[0]:match[1]])...)
 460         })
 461         return string(b)
 462 }
 463
 464 func (re *Regexp) replaceAll(bsrc []byte, src string, nmatch int, repl func(dst []byte, m []int) []byte) []byte {
 465         lastMatchEnd := 0 // end position of the most recent match
 466         searchPos := 0    // position where we next look for a match
 467         var buf []byte
 468         var endPos int
 469         if bsrc != nil {
 470                 endPos = len(bsrc)
 471         } else {
 472                 endPos = len(src)
 473         }
 474         for searchPos <= endPos {
 475                 a := re.doExecute(nil, bsrc, src, searchPos, nmatch)
 476                 if len(a) == 0 {
 477                         break // no more matches
 478                 }
 479
 480                 // Copy the unmatched characters before this match.
 481                 if bsrc != nil {
 482                         buf = append(buf, bsrc[lastMatchEnd:a[0]]...)
 483                 } else {
 484                         buf = append(buf, src[lastMatchEnd:a[0]]...)
 485                 }
 486
 487                 // Now insert a copy of the replacement string, but not for a
 488                 // match of the empty string immediately after another match.
 489                 // (Otherwise, we get double replacement for patterns that
 490                 // match both empty and nonempty strings.)
 491                 if a[1] > lastMatchEnd || a[0] == 0 {
 492                         buf = repl(buf, a)
 493                 }
 494                 lastMatchEnd = a[1]
 495
 496                 // Advance past this match; always advance at least one character.
 497                 var width int
 498                 if bsrc != nil {
 499                         _, width = utf8.DecodeRune(bsrc[searchPos:])
 500                 } else {
 501                         _, width = utf8.DecodeRuneInString(src[searchPos:])
 502                 }
 503                 if searchPos+width > a[1] {
 504                         searchPos += width
 505                 } else if searchPos+1 > a[1] {
 506                         // This clause is only needed at the end of the input
 507                         // string.  In that case, DecodeRuneInString returns width=0.
 508                         searchPos++
 509                 } else {
 510                         searchPos = a[1]
 511                 }
 512         }
 513
 514         // Copy the unmatched characters after the last match.
 515         if bsrc != nil {
 516                 buf = append(buf, bsrc[lastMatchEnd:]...)
 517         } else {
 518                 buf = append(buf, src[lastMatchEnd:]...)
 519         }
 520
 521         return buf
 522 }
 523
 524 // ReplaceAll returns a copy of src, replacing matches of the Regexp
 525 // with the replacement text repl.  Inside repl, $ signs are interpreted as
 526 // in Expand, so for instance $1 represents the text of the first submatch.
 527 func (re *Regexp) ReplaceAll(src, repl []byte) []byte {
 528         n := 2
 529         if bytes.IndexByte(repl, '$') >= 0 {
 530                 n = 2 * (re.numSubexp + 1)
 531         }
 532         srepl := ""
 533         b := re.replaceAll(src, "", n, func(dst []byte, match []int) []byte {
 534                 if len(srepl) != len(repl) {
 535                         srepl = string(repl)
 536                 }
 537                 return re.expand(dst, srepl, src, "", match)
 538         })
 539         return b
 540 }
 541
 542 // ReplaceAllLiteral returns a copy of src, replacing matches of the Regexp
 543 // with the replacement bytes repl.  The replacement repl is substituted directly,
 544 // without using Expand.
 545 func (re *Regexp) ReplaceAllLiteral(src, repl []byte) []byte {
 546         return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte {
 547                 return append(dst, repl...)
 548         })
 549 }
 550
 551 // ReplaceAllFunc returns a copy of src in which all matches of the
 552 // Regexp have been replaced by the return value of function repl applied
 553 // to the matched byte slice.  The replacement returned by repl is substituted
 554 // directly, without using Expand.
 555 func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte {
 556         return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte {
 557                 return append(dst, repl(src[match[0]:match[1]])...)
 558         })
 559 }
 560
 561 var specialBytes = []byte(`\.+*?()|[]{}^$`)
 562
 563 func special(b byte) bool {
 564         return bytes.IndexByte(specialBytes, b) >= 0
 565 }
 566
 567 // QuoteMeta returns a string that quotes all regular expression metacharacters
 568 // inside the argument text; the returned string is a regular expression matching
 569 // the literal text.  For example, QuoteMeta(`[foo]`) returns `\[foo\]`.
 570 func QuoteMeta(s string) string {
 571         b := make([]byte, 2*len(s))
 572
 573         // A byte loop is correct because all metacharacters are ASCII.
 574         j := 0
 575         for i := 0; i < len(s); i++ {
 576                 if special(s[i]) {
 577                         b[j] = '\\'
 578                         j++
 579                 }
 580                 b[j] = s[i]
 581                 j++
 582         }
 583         return string(b[0:j])
 584 }
 585
 586 // The number of capture values in the program may correspond
 587 // to fewer capturing expressions than are in the regexp.
 588 // For example, "(a){0}" turns into an empty program, so the
 589 // maximum capture in the program is 0 but we need to return
 590 // an expression for \1.  Pad appends -1s to the slice a as needed.
 591 func (re *Regexp) pad(a []int) []int {
 592         if a == nil {
 593                 // No match.
 594                 return nil
 595         }
 596         n := (1 + re.numSubexp) * 2
 597         for len(a) < n {
 598                 a = append(a, -1)
 599         }
 600         return a
 601 }
 602
 603 // Find matches in slice b if b is non-nil, otherwise find matches in string s.
 604 func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) {
 605         var end int
 606         if b == nil {
 607                 end = len(s)
 608         } else {
 609                 end = len(b)
 610         }
 611
 612         for pos, i, prevMatchEnd := 0, 0, -1; i < n && pos <= end; {
 613                 matches := re.doExecute(nil, b, s, pos, re.prog.NumCap)
 614                 if len(matches) == 0 {
 615                         break
 616                 }
 617
 618                 accept := true
 619                 if matches[1] == pos {
 620                         // We've found an empty match.
 621                         if matches[0] == prevMatchEnd {
 622                                 // We don't allow an empty match right
 623                                 // after a previous match, so ignore it.
 624                                 accept = false
 625                         }
 626                         var width int
 627                         // TODO: use step()
 628                         if b == nil {
 629                                 _, width = utf8.DecodeRuneInString(s[pos:end])
 630                         } else {
 631                                 _, width = utf8.DecodeRune(b[pos:end])
 632                         }
 633                         if width > 0 {
 634                                 pos += width
 635                         } else {
 636                                 pos = end + 1
 637                         }
 638                 } else {
 639                         pos = matches[1]
 640                 }
 641                 prevMatchEnd = matches[1]
 642
 643                 if accept {
 644                         deliver(re.pad(matches))
 645                         i++
 646                 }
 647         }
 648 }
 649
 650 // Find returns a slice holding the text of the leftmost match in b of the regular expression.
 651 // A return value of nil indicates no match.
 652 func (re *Regexp) Find(b []byte) []byte {
 653         a := re.doExecute(nil, b, "", 0, 2)
 654         if a == nil {
 655                 return nil
 656         }
 657         return b[a[0]:a[1]]
 658 }
 659
 660 // FindIndex returns a two-element slice of integers defining the location of
 661 // the leftmost match in b of the regular expression.  The match itself is at
 662 // b[loc[0]:loc[1]].
 663 // A return value of nil indicates no match.
 664 func (re *Regexp) FindIndex(b []byte) (loc []int) {
 665         a := re.doExecute(nil, b, "", 0, 2)
 666         if a == nil {
 667                 return nil
 668         }
 669         return a[0:2]
 670 }
 671
 672 // FindString returns a string holding the text of the leftmost match in s of the regular
 673 // expression.  If there is no match, the return value is an empty string,
 674 // but it will also be empty if the regular expression successfully matches
 675 // an empty string.  Use FindStringIndex or FindStringSubmatch if it is
 676 // necessary to distinguish these cases.
 677 func (re *Regexp) FindString(s string) string {
 678         a := re.doExecute(nil, nil, s, 0, 2)
 679         if a == nil {
 680                 return ""
 681         }
 682         return s[a[0]:a[1]]
 683 }
 684
 685 // FindStringIndex returns a two-element slice of integers defining the
 686 // location of the leftmost match in s of the regular expression.  The match
 687 // itself is at s[loc[0]:loc[1]].
 688 // A return value of nil indicates no match.
 689 func (re *Regexp) FindStringIndex(s string) (loc []int) {
 690         a := re.doExecute(nil, nil, s, 0, 2)
 691         if a == nil {
 692                 return nil
 693         }
 694         return a[0:2]
 695 }
 696
 697 // FindReaderIndex returns a two-element slice of integers defining the
 698 // location of the leftmost match of the regular expression in text read from
 699 // the RuneReader.  The match text was found in the input stream at
 700 // byte offset loc[0] through loc[1]-1.
 701 // A return value of nil indicates no match.
 702 func (re *Regexp) FindReaderIndex(r io.RuneReader) (loc []int) {
 703         a := re.doExecute(r, nil, "", 0, 2)
 704         if a == nil {
 705                 return nil
 706         }
 707         return a[0:2]
 708 }
 709
 710 // FindSubmatch returns a slice of slices holding the text of the leftmost
 711 // match of the regular expression in b and the matches, if any, of its
 712 // subexpressions, as defined by the 'Submatch' descriptions in the package
 713 // comment.
 714 // A return value of nil indicates no match.
 715 func (re *Regexp) FindSubmatch(b []byte) [][]byte {
 716         a := re.doExecute(nil, b, "", 0, re.prog.NumCap)
 717         if a == nil {
 718                 return nil
 719         }
 720         ret := make([][]byte, 1+re.numSubexp)
 721         for i := range ret {
 722                 if 2*i < len(a) && a[2*i] >= 0 {
 723                         ret[i] = b[a[2*i]:a[2*i+1]]
 724                 }
 725         }
 726         return ret
 727 }
 728
 729 // Expand appends template to dst and returns the result; during the
 730 // append, Expand replaces variables in the template with corresponding
 731 // matches drawn from src.  The match slice should have been returned by
 732 // FindSubmatchIndex.
 733 //
 734 // In the template, a variable is denoted by a substring of the form
 735 // $name or ${name}, where name is a non-empty sequence of letters,
 736 // digits, and underscores.  A purely numeric name like $1 refers to
 737 // the submatch with the corresponding index; other names refer to
 738 // capturing parentheses named with the (?P<name>...) syntax.  A
 739 // reference to an out of range or unmatched index or a name that is not
 740 // present in the regular expression is replaced with an empty slice.
 741 //
 742 // In the $name form, name is taken to be as long as possible: $1x is
 743 // equivalent to ${1x}, not ${1}x, and, $10 is equivalent to ${10}, not ${1}0.
 744 //
 745 // To insert a literal $ in the output, use $$ in the template.
 746 func (re *Regexp) Expand(dst []byte, template []byte, src []byte, match []int) []byte {
 747         return re.expand(dst, string(template), src, "", match)
 748 }
 749
 750 // ExpandString is like Expand but the template and source are strings.
 751 // It appends to and returns a byte slice in order to give the calling
 752 // code control over allocation.
 753 func (re *Regexp) ExpandString(dst []byte, template string, src string, match []int) []byte {
 754         return re.expand(dst, template, nil, src, match)
 755 }
 756
 757 func (re *Regexp) expand(dst []byte, template string, bsrc []byte, src string, match []int) []byte {
 758         for len(template) > 0 {
 759                 i := strings.Index(template, "$")
 760                 if i < 0 {
 761                         break
 762                 }
 763                 dst = append(dst, template[:i]...)
 764                 template = template[i:]
 765                 if len(template) > 1 && template[1] == '$' {
 766                         // Treat $$ as $.
 767                         dst = append(dst, '$')
 768                         template = template[2:]
 769                         continue
 770                 }
 771                 name, num, rest, ok := extract(template)
 772                 if !ok {
 773                         // Malformed; treat $ as raw text.
 774                         dst = append(dst, '$')
 775                         template = template[1:]
 776                         continue
 777                 }
 778                 template = rest
 779                 if num >= 0 {
 780                         if 2*num+1 < len(match) && match[2*num] >= 0 {
 781                                 if bsrc != nil {
 782                                         dst = append(dst, bsrc[match[2*num]:match[2*num+1]]...)
 783                                 } else {
 784                                         dst = append(dst, src[match[2*num]:match[2*num+1]]...)
 785                                 }
 786                         }
 787                 } else {
 788                         for i, namei := range re.subexpNames {
 789                                 if name == namei && 2*i+1 < len(match) && match[2*i] >= 0 {
 790                                         if bsrc != nil {
 791                                                 dst = append(dst, bsrc[match[2*i]:match[2*i+1]]...)
 792                                         } else {
 793                                                 dst = append(dst, src[match[2*i]:match[2*i+1]]...)
 794                                         }
 795                                         break
 796                                 }
 797                         }
 798                 }
 799         }
 800         dst = append(dst, template...)
 801         return dst
 802 }
 803
 804 // extract returns the name from a leading "$name" or "${name}" in str.
 805 // If it is a number, extract returns num set to that number; otherwise num = -1.
 806 func extract(str string) (name string, num int, rest string, ok bool) {
 807         if len(str) < 2 || str[0] != '$' {
 808                 return
 809         }
 810         brace := false
 811         if str[1] == '{' {
 812                 brace = true
 813                 str = str[2:]
 814         } else {
 815                 str = str[1:]
 816         }
 817         i := 0
 818         for i < len(str) {
 819                 rune, size := utf8.DecodeRuneInString(str[i:])
 820                 if !unicode.IsLetter(rune) && !unicode.IsDigit(rune) && rune != '_' {
 821                         break
 822                 }
 823                 i += size
 824         }
 825         if i == 0 {
 826                 // empty name is not okay
 827                 return
 828         }
 829         name = str[:i]
 830         if brace {
 831                 if i >= len(str) || str[i] != '}' {
 832                         // missing closing brace
 833                         return
 834                 }
 835                 i++
 836         }
 837
 838         // Parse number.
 839         num = 0
 840         for i := 0; i < len(name); i++ {
 841                 if name[i] < '0' || '9' < name[i] || num >= 1e8 {
 842                         num = -1
 843                         break
 844                 }
 845                 num = num*10 + int(name[i]) - '0'
 846         }
 847         // Disallow leading zeros.
 848         if name[0] == '0' && len(name) > 1 {
 849                 num = -1
 850         }
 851
 852         rest = str[i:]
 853         ok = true
 854         return
 855 }
 856
 857 // FindSubmatchIndex returns a slice holding the index pairs identifying the
 858 // leftmost match of the regular expression in b and the matches, if any, of
 859 // its subexpressions, as defined by the 'Submatch' and 'Index' descriptions
 860 // in the package comment.
 861 // A return value of nil indicates no match.
 862 func (re *Regexp) FindSubmatchIndex(b []byte) []int {
 863         return re.pad(re.doExecute(nil, b, "", 0, re.prog.NumCap))
 864 }
 865
 866 // FindStringSubmatch returns a slice of strings holding the text of the
 867 // leftmost match of the regular expression in s and the matches, if any, of
 868 // its subexpressions, as defined by the 'Submatch' description in the
 869 // package comment.
 870 // A return value of nil indicates no match.
 871 func (re *Regexp) FindStringSubmatch(s string) []string {
 872         a := re.doExecute(nil, nil, s, 0, re.prog.NumCap)
 873         if a == nil {
 874                 return nil
 875         }
 876         ret := make([]string, 1+re.numSubexp)
 877         for i := range ret {
 878                 if 2*i < len(a) && a[2*i] >= 0 {
 879                         ret[i] = s[a[2*i]:a[2*i+1]]
 880                 }
 881         }
 882         return ret
 883 }
 884
 885 // FindStringSubmatchIndex returns a slice holding the index pairs
 886 // identifying the leftmost match of the regular expression in s and the
 887 // matches, if any, of its subexpressions, as defined by the 'Submatch' and
 888 // 'Index' descriptions in the package comment.
 889 // A return value of nil indicates no match.
 890 func (re *Regexp) FindStringSubmatchIndex(s string) []int {
 891         return re.pad(re.doExecute(nil, nil, s, 0, re.prog.NumCap))
 892 }
 893
 894 // FindReaderSubmatchIndex returns a slice holding the index pairs
 895 // identifying the leftmost match of the regular expression of text read by
 896 // the RuneReader, and the matches, if any, of its subexpressions, as defined
 897 // by the 'Submatch' and 'Index' descriptions in the package comment.  A
 898 // return value of nil indicates no match.
 899 func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int {
 900         return re.pad(re.doExecute(r, nil, "", 0, re.prog.NumCap))
 901 }
 902
 903 const startSize = 10 // The size at which to start a slice in the 'All' routines.
 904
 905 // FindAll is the 'All' version of Find; it returns a slice of all successive
 906 // matches of the expression, as defined by the 'All' description in the
 907 // package comment.
 908 // A return value of nil indicates no match.
 909 func (re *Regexp) FindAll(b []byte, n int) [][]byte {
 910         if n < 0 {
 911                 n = len(b) + 1
 912         }
 913         result := make([][]byte, 0, startSize)
 914         re.allMatches("", b, n, func(match []int) {
 915                 result = append(result, b[match[0]:match[1]])
 916         })
 917         if len(result) == 0 {
 918                 return nil
 919         }
 920         return result
 921 }
 922
 923 // FindAllIndex is the 'All' version of FindIndex; it returns a slice of all
 924 // successive matches of the expression, as defined by the 'All' description
 925 // in the package comment.
 926 // A return value of nil indicates no match.
 927 func (re *Regexp) FindAllIndex(b []byte, n int) [][]int {
 928         if n < 0 {
 929                 n = len(b) + 1
 930         }
 931         result := make([][]int, 0, startSize)
 932         re.allMatches("", b, n, func(match []int) {
 933                 result = append(result, match[0:2])
 934         })
 935         if len(result) == 0 {
 936                 return nil
 937         }
 938         return result
 939 }
 940
 941 // FindAllString is the 'All' version of FindString; it returns a slice of all
 942 // successive matches of the expression, as defined by the 'All' description
 943 // in the package comment.
 944 // A return value of nil indicates no match.
 945 func (re *Regexp) FindAllString(s string, n int) []string {
 946         if n < 0 {
 947                 n = len(s) + 1
 948         }
 949         result := make([]string, 0, startSize)
 950         re.allMatches(s, nil, n, func(match []int) {
 951                 result = append(result, s[match[0]:match[1]])
 952         })
 953         if len(result) == 0 {
 954                 return nil
 955         }
 956         return result
 957 }
 958
 959 // FindAllStringIndex is the 'All' version of FindStringIndex; it returns a
 960 // slice of all successive matches of the expression, as defined by the 'All'
 961 // description in the package comment.
 962 // A return value of nil indicates no match.
 963 func (re *Regexp) FindAllStringIndex(s string, n int) [][]int {
 964         if n < 0 {
 965                 n = len(s) + 1
 966         }
 967         result := make([][]int, 0, startSize)
 968         re.allMatches(s, nil, n, func(match []int) {
 969                 result = append(result, match[0:2])
 970         })
 971         if len(result) == 0 {
 972                 return nil
 973         }
 974         return result
 975 }
 976
 977 // FindAllSubmatch is the 'All' version of FindSubmatch; it returns a slice
 978 // of all successive matches of the expression, as defined by the 'All'
 979 // description in the package comment.
 980 // A return value of nil indicates no match.
 981 func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte {
 982         if n < 0 {
 983                 n = len(b) + 1
 984         }
 985         result := make([][][]byte, 0, startSize)
 986         re.allMatches("", b, n, func(match []int) {
 987                 slice := make([][]byte, len(match)/2)
 988                 for j := range slice {
 989                         if match[2*j] >= 0 {
 990                                 slice[j] = b[match[2*j]:match[2*j+1]]
 991                         }
 992                 }
 993                 result = append(result, slice)
 994         })
 995         if len(result) == 0 {
 996                 return nil
 997         }
 998         return result
 999 }
1000
1001 // FindAllSubmatchIndex is the 'All' version of FindSubmatchIndex; it returns
1002 // a slice of all successive matches of the expression, as defined by the
1003 // 'All' description in the package comment.
1004 // A return value of nil indicates no match.
1005 func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int {
1006         if n < 0 {
1007                 n = len(b) + 1
1008         }
1009         result := make([][]int, 0, startSize)
1010         re.allMatches("", b, n, func(match []int) {
1011                 result = append(result, match)
1012         })
1013         if len(result) == 0 {
1014                 return nil
1015         }
1016         return result
1017 }
1018
1019 // FindAllStringSubmatch is the 'All' version of FindStringSubmatch; it
1020 // returns a slice of all successive matches of the expression, as defined by
1021 // the 'All' description in the package comment.
1022 // A return value of nil indicates no match.
1023 func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string {
1024         if n < 0 {
1025                 n = len(s) + 1
1026         }
1027         result := make([][]string, 0, startSize)
1028         re.allMatches(s, nil, n, func(match []int) {
1029                 slice := make([]string, len(match)/2)
1030                 for j := range slice {
1031                         if match[2*j] >= 0 {
1032                                 slice[j] = s[match[2*j]:match[2*j+1]]
1033                         }
1034                 }
1035                 result = append(result, slice)
1036         })
1037         if len(result) == 0 {
1038                 return nil
1039         }
1040         return result
1041 }
1042
1043 // FindAllStringSubmatchIndex is the 'All' version of
1044 // FindStringSubmatchIndex; it returns a slice of all successive matches of
1045 // the expression, as defined by the 'All' description in the package
1046 // comment.
1047 // A return value of nil indicates no match.
1048 func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int {
1049         if n < 0 {
1050                 n = len(s) + 1
1051         }
1052         result := make([][]int, 0, startSize)
1053         re.allMatches(s, nil, n, func(match []int) {
1054                 result = append(result, match)
1055         })
1056         if len(result) == 0 {
1057                 return nil
1058         }
1059         return result
1060 }
1061
1062 // Split slices s into substrings separated by the expression and returns a slice of
1063 // the substrings between those expression matches.
1064 //
1065 // The slice returned by this method consists of all the substrings of s
1066 // not contained in the slice returned by FindAllString. When called on an expression
1067 // that contains no metacharacters, it is equivalent to strings.SplitN.
1068 //
1069 // Example:
1070 //   s := regexp.MustCompile("a*").Split("abaabaccadaaae", 5)
1071 //   // s: ["", "b", "b", "c", "cadaaae"]
1072 //
1073 // The count determines the number of substrings to return:
1074 //   n > 0: at most n substrings; the last substring will be the unsplit remainder.
1075 //   n == 0: the result is nil (zero substrings)
1076 //   n < 0: all substrings
1077 func (re *Regexp) Split(s string, n int) []string {
1078
1079         if n == 0 {
1080                 return nil
1081         }
1082
1083         if len(re.expr) > 0 && len(s) == 0 {
1084                 return []string{""}
1085         }
1086
1087         matches := re.FindAllStringIndex(s, n)
1088         strings := make([]string, 0, len(matches))
1089
1090         beg := 0
1091         end := 0
1092         for _, match := range matches {
1093                 if n > 0 && len(strings) >= n-1 {
1094                         break
1095                 }
1096
1097                 end = match[0]
1098                 if match[1] != 0 {
1099                         strings = append(strings, s[beg:end])
1100                 }
1101                 beg = match[1]
1102         }
1103
1104         if end != len(s) {
1105                 strings = append(strings, s[beg:])
1106         }
1107
1108         return strings
1109 }