libgo/go/regexp/regexp.go

   1 // Copyright 2009 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 // Package regexp implements regular expression search.
   6 //
   7 // The syntax of the regular expressions accepted is the same
   8 // general syntax used by Perl, Python, and other languages.
   9 // More precisely, it is the syntax accepted by RE2 and described at
  10 // http://code.google.com/p/re2/wiki/Syntax, except for \C.
  11 //
  12 // All characters are UTF-8-encoded code points.
  13 //
  14 // There are 16 methods of Regexp that match a regular expression and identify
  15 // the matched text.  Their names are matched by this regular expression:
  16 //
  17 //      Find(All)?(String)?(Submatch)?(Index)?
  18 //
  19 // If 'All' is present, the routine matches successive non-overlapping
  20 // matches of the entire expression.  Empty matches abutting a preceding
  21 // match are ignored.  The return value is a slice containing the successive
  22 // return values of the corresponding non-'All' routine.  These routines take
  23 // an extra integer argument, n; if n >= 0, the function returns at most n
  24 // matches/submatches.
  25 //
  26 // If 'String' is present, the argument is a string; otherwise it is a slice
  27 // of bytes; return values are adjusted as appropriate.
  28 //
  29 // If 'Submatch' is present, the return value is a slice identifying the
  30 // successive submatches of the expression.  Submatches are matches of
  31 // parenthesized subexpressions within the regular expression, numbered from
  32 // left to right in order of opening parenthesis.  Submatch 0 is the match of
  33 // the entire expression, submatch 1 the match of the first parenthesized
  34 // subexpression, and so on.
  35 //
  36 // If 'Index' is present, matches and submatches are identified by byte index
  37 // pairs within the input string: result[2*n:2*n+1] identifies the indexes of
  38 // the nth submatch.  The pair for n==0 identifies the match of the entire
  39 // expression.  If 'Index' is not present, the match is identified by the
  40 // text of the match/submatch.  If an index is negative, it means that
  41 // subexpression did not match any string in the input.
  42 //
  43 // There is also a subset of the methods that can be applied to text read
  44 // from a RuneReader:
  45 //
  46 //      MatchReader, FindReaderIndex, FindReaderSubmatchIndex
  47 //
  48 // This set may grow.  Note that regular expression matches may need to
  49 // examine text beyond the text returned by a match, so the methods that
  50 // match text from a RuneReader may read arbitrarily far into the input
  51 // before returning.
  52 //
  53 // (There are a few other methods that do not match this pattern.)
  54 //
  55 package regexp
  56
  57 import (
  58         "bytes"
  59         "io"
  60         "regexp/syntax"
  61         "strconv"
  62         "strings"
  63         "sync"
  64         "unicode"
  65         "unicode/utf8"
  66 )
  67
  68 var debug = false
  69
  70 // Regexp is the representation of a compiled regular expression.
  71 // The public interface is entirely through methods.
  72 // A Regexp is safe for concurrent use by multiple goroutines.
  73 type Regexp struct {
  74         // read-only after Compile
  75         expr           string         // as passed to Compile
  76         prog           *syntax.Prog   // compiled program
  77         prefix         string         // required prefix in unanchored matches
  78         prefixBytes    []byte         // prefix, as a []byte
  79         prefixComplete bool           // prefix is the entire regexp
  80         prefixRune     rune           // first rune in prefix
  81         cond           syntax.EmptyOp // empty-width conditions required at start of match
  82         numSubexp      int
  83         subexpNames    []string
  84         longest        bool
  85
  86         // cache of machines for running regexp
  87         mu      sync.Mutex
  88         machine []*machine
  89 }
  90
  91 // String returns the source text used to compile the regular expression.
  92 func (re *Regexp) String() string {
  93         return re.expr
  94 }
  95
  96 // Compile parses a regular expression and returns, if successful,
  97 // a Regexp object that can be used to match against text.
  98 //
  99 // When matching against text, the regexp returns a match that
 100 // begins as early as possible in the input (leftmost), and among those
 101 // it chooses the one that a backtracking search would have found first.
 102 // This so-called leftmost-first matching is the same semantics
 103 // that Perl, Python, and other implementations use, although this
 104 // package implements it without the expense of backtracking.
 105 // For POSIX leftmost-longest matching, see CompilePOSIX.
 106 func Compile(expr string) (*Regexp, error) {
 107         return compile(expr, syntax.Perl, false)
 108 }
 109
 110 // CompilePOSIX is like Compile but restricts the regular expression
 111 // to POSIX ERE (egrep) syntax and changes the match semantics to
 112 // leftmost-longest.
 113 //
 114 // That is, when matching against text, the regexp returns a match that
 115 // begins as early as possible in the input (leftmost), and among those
 116 // it chooses a match that is as long as possible.
 117 // This so-called leftmost-longest matching is the same semantics
 118 // that early regular expression implementations used and that POSIX
 119 // specifies.
 120 //
 121 // However, there can be multiple leftmost-longest matches, with different
 122 // submatch choices, and here this package diverges from POSIX.
 123 // Among the possible leftmost-longest matches, this package chooses
 124 // the one that a backtracking search would have found first, while POSIX
 125 // specifies that the match be chosen to maximize the length of the first
 126 // subexpression, then the second, and so on from left to right.
 127 // The POSIX rule is computationally prohibitive and not even well-defined.
 128 // See http://swtch.com/~rsc/regexp/regexp2.html#posix for details.
 129 func CompilePOSIX(expr string) (*Regexp, error) {
 130         return compile(expr, syntax.POSIX, true)
 131 }
 132
 133 func compile(expr string, mode syntax.Flags, longest bool) (*Regexp, error) {
 134         re, err := syntax.Parse(expr, mode)
 135         if err != nil {
 136                 return nil, err
 137         }
 138         maxCap := re.MaxCap()
 139         capNames := re.CapNames()
 140
 141         re = re.Simplify()
 142         prog, err := syntax.Compile(re)
 143         if err != nil {
 144                 return nil, err
 145         }
 146         regexp := &Regexp{
 147                 expr:        expr,
 148                 prog:        prog,
 149                 numSubexp:   maxCap,
 150                 subexpNames: capNames,
 151                 cond:        prog.StartCond(),
 152                 longest:     longest,
 153         }
 154         regexp.prefix, regexp.prefixComplete = prog.Prefix()
 155         if regexp.prefix != "" {
 156                 // TODO(rsc): Remove this allocation by adding
 157                 // IndexString to package bytes.
 158                 regexp.prefixBytes = []byte(regexp.prefix)
 159                 regexp.prefixRune, _ = utf8.DecodeRuneInString(regexp.prefix)
 160         }
 161         return regexp, nil
 162 }
 163
 164 // get returns a machine to use for matching re.
 165 // It uses the re's machine cache if possible, to avoid
 166 // unnecessary allocation.
 167 func (re *Regexp) get() *machine {
 168         re.mu.Lock()
 169         if n := len(re.machine); n > 0 {
 170                 z := re.machine[n-1]
 171                 re.machine = re.machine[:n-1]
 172                 re.mu.Unlock()
 173                 return z
 174         }
 175         re.mu.Unlock()
 176         z := progMachine(re.prog)
 177         z.re = re
 178         return z
 179 }
 180
 181 // put returns a machine to the re's machine cache.
 182 // There is no attempt to limit the size of the cache, so it will
 183 // grow to the maximum number of simultaneous matches
 184 // run using re.  (The cache empties when re gets garbage collected.)
 185 func (re *Regexp) put(z *machine) {
 186         re.mu.Lock()
 187         re.machine = append(re.machine, z)
 188         re.mu.Unlock()
 189 }
 190
 191 // MustCompile is like Compile but panics if the expression cannot be parsed.
 192 // It simplifies safe initialization of global variables holding compiled regular
 193 // expressions.
 194 func MustCompile(str string) *Regexp {
 195         regexp, error := Compile(str)
 196         if error != nil {
 197                 panic(`regexp: Compile(` + quote(str) + `): ` + error.Error())
 198         }
 199         return regexp
 200 }
 201
 202 // MustCompilePOSIX is like CompilePOSIX but panics if the expression cannot be parsed.
 203 // It simplifies safe initialization of global variables holding compiled regular
 204 // expressions.
 205 func MustCompilePOSIX(str string) *Regexp {
 206         regexp, error := CompilePOSIX(str)
 207         if error != nil {
 208                 panic(`regexp: CompilePOSIX(` + quote(str) + `): ` + error.Error())
 209         }
 210         return regexp
 211 }
 212
 213 func quote(s string) string {
 214         if strconv.CanBackquote(s) {
 215                 return "`" + s + "`"
 216         }
 217         return strconv.Quote(s)
 218 }
 219
 220 // NumSubexp returns the number of parenthesized subexpressions in this Regexp.
 221 func (re *Regexp) NumSubexp() int {
 222         return re.numSubexp
 223 }
 224
 225 // SubexpNames returns the names of the parenthesized subexpressions
 226 // in this Regexp.  The name for the first sub-expression is names[1],
 227 // so that if m is a match slice, the name for m[i] is SubexpNames()[i].
 228 // Since the Regexp as a whole cannot be named, names[0] is always
 229 // the empty string.  The slice should not be modified.
 230 func (re *Regexp) SubexpNames() []string {
 231         return re.subexpNames
 232 }
 233
 234 const endOfText rune = -1
 235
 236 // input abstracts different representations of the input text. It provides
 237 // one-character lookahead.
 238 type input interface {
 239         step(pos int) (r rune, width int) // advance one rune
 240         canCheckPrefix() bool             // can we look ahead without losing info?
 241         hasPrefix(re *Regexp) bool
 242         index(re *Regexp, pos int) int
 243         context(pos int) syntax.EmptyOp
 244 }
 245
 246 // inputString scans a string.
 247 type inputString struct {
 248         str string
 249 }
 250
 251 func (i *inputString) step(pos int) (rune, int) {
 252         if pos < len(i.str) {
 253                 c := i.str[pos]
 254                 if c < utf8.RuneSelf {
 255                         return rune(c), 1
 256                 }
 257                 return utf8.DecodeRuneInString(i.str[pos:])
 258         }
 259         return endOfText, 0
 260 }
 261
 262 func (i *inputString) canCheckPrefix() bool {
 263         return true
 264 }
 265
 266 func (i *inputString) hasPrefix(re *Regexp) bool {
 267         return strings.HasPrefix(i.str, re.prefix)
 268 }
 269
 270 func (i *inputString) index(re *Regexp, pos int) int {
 271         return strings.Index(i.str[pos:], re.prefix)
 272 }
 273
 274 func (i *inputString) context(pos int) syntax.EmptyOp {
 275         r1, r2 := endOfText, endOfText
 276         if pos > 0 && pos <= len(i.str) {
 277                 r1, _ = utf8.DecodeLastRuneInString(i.str[:pos])
 278         }
 279         if pos < len(i.str) {
 280                 r2, _ = utf8.DecodeRuneInString(i.str[pos:])
 281         }
 282         return syntax.EmptyOpContext(r1, r2)
 283 }
 284
 285 // inputBytes scans a byte slice.
 286 type inputBytes struct {
 287         str []byte
 288 }
 289
 290 func (i *inputBytes) step(pos int) (rune, int) {
 291         if pos < len(i.str) {
 292                 c := i.str[pos]
 293                 if c < utf8.RuneSelf {
 294                         return rune(c), 1
 295                 }
 296                 return utf8.DecodeRune(i.str[pos:])
 297         }
 298         return endOfText, 0
 299 }
 300
 301 func (i *inputBytes) canCheckPrefix() bool {
 302         return true
 303 }
 304
 305 func (i *inputBytes) hasPrefix(re *Regexp) bool {
 306         return bytes.HasPrefix(i.str, re.prefixBytes)
 307 }
 308
 309 func (i *inputBytes) index(re *Regexp, pos int) int {
 310         return bytes.Index(i.str[pos:], re.prefixBytes)
 311 }
 312
 313 func (i *inputBytes) context(pos int) syntax.EmptyOp {
 314         r1, r2 := endOfText, endOfText
 315         if pos > 0 && pos <= len(i.str) {
 316                 r1, _ = utf8.DecodeLastRune(i.str[:pos])
 317         }
 318         if pos < len(i.str) {
 319                 r2, _ = utf8.DecodeRune(i.str[pos:])
 320         }
 321         return syntax.EmptyOpContext(r1, r2)
 322 }
 323
 324 // inputReader scans a RuneReader.
 325 type inputReader struct {
 326         r     io.RuneReader
 327         atEOT bool
 328         pos   int
 329 }
 330
 331 func (i *inputReader) step(pos int) (rune, int) {
 332         if !i.atEOT && pos != i.pos {
 333                 return endOfText, 0
 334
 335         }
 336         r, w, err := i.r.ReadRune()
 337         if err != nil {
 338                 i.atEOT = true
 339                 return endOfText, 0
 340         }
 341         i.pos += w
 342         return r, w
 343 }
 344
 345 func (i *inputReader) canCheckPrefix() bool {
 346         return false
 347 }
 348
 349 func (i *inputReader) hasPrefix(re *Regexp) bool {
 350         return false
 351 }
 352
 353 func (i *inputReader) index(re *Regexp, pos int) int {
 354         return -1
 355 }
 356
 357 func (i *inputReader) context(pos int) syntax.EmptyOp {
 358         return 0
 359 }
 360
 361 // LiteralPrefix returns a literal string that must begin any match
 362 // of the regular expression re.  It returns the boolean true if the
 363 // literal string comprises the entire regular expression.
 364 func (re *Regexp) LiteralPrefix() (prefix string, complete bool) {
 365         return re.prefix, re.prefixComplete
 366 }
 367
 368 // MatchReader returns whether the Regexp matches the text read by the
 369 // RuneReader.  The return value is a boolean: true for match, false for no
 370 // match.
 371 func (re *Regexp) MatchReader(r io.RuneReader) bool {
 372         return re.doExecute(r, nil, "", 0, 0) != nil
 373 }
 374
 375 // MatchString returns whether the Regexp matches the string s.
 376 // The return value is a boolean: true for match, false for no match.
 377 func (re *Regexp) MatchString(s string) bool {
 378         return re.doExecute(nil, nil, s, 0, 0) != nil
 379 }
 380
 381 // Match returns whether the Regexp matches the byte slice b.
 382 // The return value is a boolean: true for match, false for no match.
 383 func (re *Regexp) Match(b []byte) bool {
 384         return re.doExecute(nil, b, "", 0, 0) != nil
 385 }
 386
 387 // MatchReader checks whether a textual regular expression matches the text
 388 // read by the RuneReader.  More complicated queries need to use Compile and
 389 // the full Regexp interface.
 390 func MatchReader(pattern string, r io.RuneReader) (matched bool, error error) {
 391         re, err := Compile(pattern)
 392         if err != nil {
 393                 return false, err
 394         }
 395         return re.MatchReader(r), nil
 396 }
 397
 398 // MatchString checks whether a textual regular expression
 399 // matches a string.  More complicated queries need
 400 // to use Compile and the full Regexp interface.
 401 func MatchString(pattern string, s string) (matched bool, error error) {
 402         re, err := Compile(pattern)
 403         if err != nil {
 404                 return false, err
 405         }
 406         return re.MatchString(s), nil
 407 }
 408
 409 // Match checks whether a textual regular expression
 410 // matches a byte slice.  More complicated queries need
 411 // to use Compile and the full Regexp interface.
 412 func Match(pattern string, b []byte) (matched bool, error error) {
 413         re, err := Compile(pattern)
 414         if err != nil {
 415                 return false, err
 416         }
 417         return re.Match(b), nil
 418 }
 419
 420 // ReplaceAllString returns a copy of src, replacing matches of the Regexp
 421 // with the replacement string repl.  Inside repl, $ signs are interpreted as
 422 // in Expand, so for instance $1 represents the text of the first submatch.
 423 func (re *Regexp) ReplaceAllString(src, repl string) string {
 424         n := 2
 425         if strings.Index(repl, "$") >= 0 {
 426                 n = 2 * (re.numSubexp + 1)
 427         }
 428         b := re.replaceAll(nil, src, n, func(dst []byte, match []int) []byte {
 429                 return re.expand(dst, repl, nil, src, match)
 430         })
 431         return string(b)
 432 }
 433
 434 // ReplaceAllStringLiteral returns a copy of src, replacing matches of the Regexp
 435 // with the replacement string repl.  The replacement repl is substituted directly,
 436 // without using Expand.
 437 func (re *Regexp) ReplaceAllLiteralString(src, repl string) string {
 438         return string(re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte {
 439                 return append(dst, repl...)
 440         }))
 441 }
 442
 443 // ReplaceAllStringFunc returns a copy of src in which all matches of the
 444 // Regexp have been replaced by the return value of function repl applied
 445 // to the matched substring.  The replacement returned by repl is substituted
 446 // directly, without using Expand.
 447 func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string {
 448         b := re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte {
 449                 return append(dst, repl(src[match[0]:match[1]])...)
 450         })
 451         return string(b)
 452 }
 453
 454 func (re *Regexp) replaceAll(bsrc []byte, src string, nmatch int, repl func(dst []byte, m []int) []byte) []byte {
 455         lastMatchEnd := 0 // end position of the most recent match
 456         searchPos := 0    // position where we next look for a match
 457         var buf []byte
 458         var endPos int
 459         if bsrc != nil {
 460                 endPos = len(bsrc)
 461         } else {
 462                 endPos = len(src)
 463         }
 464         for searchPos <= endPos {
 465                 a := re.doExecute(nil, bsrc, src, searchPos, nmatch)
 466                 if len(a) == 0 {
 467                         break // no more matches
 468                 }
 469
 470                 // Copy the unmatched characters before this match.
 471                 if bsrc != nil {
 472                         buf = append(buf, bsrc[lastMatchEnd:a[0]]...)
 473                 } else {
 474                         buf = append(buf, src[lastMatchEnd:a[0]]...)
 475                 }
 476
 477                 // Now insert a copy of the replacement string, but not for a
 478                 // match of the empty string immediately after another match.
 479                 // (Otherwise, we get double replacement for patterns that
 480                 // match both empty and nonempty strings.)
 481                 if a[1] > lastMatchEnd || a[0] == 0 {
 482                         buf = repl(buf, a)
 483                 }
 484                 lastMatchEnd = a[1]
 485
 486                 // Advance past this match; always advance at least one character.
 487                 var width int
 488                 if bsrc != nil {
 489                         _, width = utf8.DecodeRune(bsrc[searchPos:])
 490                 } else {
 491                         _, width = utf8.DecodeRuneInString(src[searchPos:])
 492                 }
 493                 if searchPos+width > a[1] {
 494                         searchPos += width
 495                 } else if searchPos+1 > a[1] {
 496                         // This clause is only needed at the end of the input
 497                         // string.  In that case, DecodeRuneInString returns width=0.
 498                         searchPos++
 499                 } else {
 500                         searchPos = a[1]
 501                 }
 502         }
 503
 504         // Copy the unmatched characters after the last match.
 505         if bsrc != nil {
 506                 buf = append(buf, bsrc[lastMatchEnd:]...)
 507         } else {
 508                 buf = append(buf, src[lastMatchEnd:]...)
 509         }
 510
 511         return buf
 512 }
 513
 514 // ReplaceAll returns a copy of src, replacing matches of the Regexp
 515 // with the replacement text repl.  Inside repl, $ signs are interpreted as
 516 // in Expand, so for instance $1 represents the text of the first submatch.
 517 func (re *Regexp) ReplaceAll(src, repl []byte) []byte {
 518         n := 2
 519         if bytes.IndexByte(repl, '$') >= 0 {
 520                 n = 2 * (re.numSubexp + 1)
 521         }
 522         srepl := ""
 523         b := re.replaceAll(src, "", n, func(dst []byte, match []int) []byte {
 524                 if len(srepl) != len(repl) {
 525                         srepl = string(repl)
 526                 }
 527                 return re.expand(dst, srepl, src, "", match)
 528         })
 529         return b
 530 }
 531
 532 // ReplaceAllLiteral returns a copy of src, replacing matches of the Regexp
 533 // with the replacement bytes repl.  The replacement repl is substituted directly,
 534 // without using Expand.
 535 func (re *Regexp) ReplaceAllLiteral(src, repl []byte) []byte {
 536         return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte {
 537                 return append(dst, repl...)
 538         })
 539 }
 540
 541 // ReplaceAllFunc returns a copy of src in which all matches of the
 542 // Regexp have been replaced by the return value of function repl applied
 543 // to the matched byte slice.  The replacement returned by repl is substituted
 544 // directly, without using Expand.
 545 func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte {
 546         return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte {
 547                 return append(dst, repl(src[match[0]:match[1]])...)
 548         })
 549 }
 550
 551 var specialBytes = []byte(`\.+*?()|[]{}^$`)
 552
 553 func special(b byte) bool {
 554         return bytes.IndexByte(specialBytes, b) >= 0
 555 }
 556
 557 // QuoteMeta returns a string that quotes all regular expression metacharacters
 558 // inside the argument text; the returned string is a regular expression matching
 559 // the literal text.  For example, QuoteMeta(`[foo]`) returns `\[foo\]`.
 560 func QuoteMeta(s string) string {
 561         b := make([]byte, 2*len(s))
 562
 563         // A byte loop is correct because all metacharacters are ASCII.
 564         j := 0
 565         for i := 0; i < len(s); i++ {
 566                 if special(s[i]) {
 567                         b[j] = '\\'
 568                         j++
 569                 }
 570                 b[j] = s[i]
 571                 j++
 572         }
 573         return string(b[0:j])
 574 }
 575
 576 // The number of capture values in the program may correspond
 577 // to fewer capturing expressions than are in the regexp.
 578 // For example, "(a){0}" turns into an empty program, so the
 579 // maximum capture in the program is 0 but we need to return
 580 // an expression for \1.  Pad appends -1s to the slice a as needed.
 581 func (re *Regexp) pad(a []int) []int {
 582         if a == nil {
 583                 // No match.
 584                 return nil
 585         }
 586         n := (1 + re.numSubexp) * 2
 587         for len(a) < n {
 588                 a = append(a, -1)
 589         }
 590         return a
 591 }
 592
 593 // Find matches in slice b if b is non-nil, otherwise find matches in string s.
 594 func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) {
 595         var end int
 596         if b == nil {
 597                 end = len(s)
 598         } else {
 599                 end = len(b)
 600         }
 601
 602         for pos, i, prevMatchEnd := 0, 0, -1; i < n && pos <= end; {
 603                 matches := re.doExecute(nil, b, s, pos, re.prog.NumCap)
 604                 if len(matches) == 0 {
 605                         break
 606                 }
 607
 608                 accept := true
 609                 if matches[1] == pos {
 610                         // We've found an empty match.
 611                         if matches[0] == prevMatchEnd {
 612                                 // We don't allow an empty match right
 613                                 // after a previous match, so ignore it.
 614                                 accept = false
 615                         }
 616                         var width int
 617                         // TODO: use step()
 618                         if b == nil {
 619                                 _, width = utf8.DecodeRuneInString(s[pos:end])
 620                         } else {
 621                                 _, width = utf8.DecodeRune(b[pos:end])
 622                         }
 623                         if width > 0 {
 624                                 pos += width
 625                         } else {
 626                                 pos = end + 1
 627                         }
 628                 } else {
 629                         pos = matches[1]
 630                 }
 631                 prevMatchEnd = matches[1]
 632
 633                 if accept {
 634                         deliver(re.pad(matches))
 635                         i++
 636                 }
 637         }
 638 }
 639
 640 // Find returns a slice holding the text of the leftmost match in b of the regular expression.
 641 // A return value of nil indicates no match.
 642 func (re *Regexp) Find(b []byte) []byte {
 643         a := re.doExecute(nil, b, "", 0, 2)
 644         if a == nil {
 645                 return nil
 646         }
 647         return b[a[0]:a[1]]
 648 }
 649
 650 // FindIndex returns a two-element slice of integers defining the location of
 651 // the leftmost match in b of the regular expression.  The match itself is at
 652 // b[loc[0]:loc[1]].
 653 // A return value of nil indicates no match.
 654 func (re *Regexp) FindIndex(b []byte) (loc []int) {
 655         a := re.doExecute(nil, b, "", 0, 2)
 656         if a == nil {
 657                 return nil
 658         }
 659         return a[0:2]
 660 }
 661
 662 // FindString returns a string holding the text of the leftmost match in s of the regular
 663 // expression.  If there is no match, the return value is an empty string,
 664 // but it will also be empty if the regular expression successfully matches
 665 // an empty string.  Use FindStringIndex or FindStringSubmatch if it is
 666 // necessary to distinguish these cases.
 667 func (re *Regexp) FindString(s string) string {
 668         a := re.doExecute(nil, nil, s, 0, 2)
 669         if a == nil {
 670                 return ""
 671         }
 672         return s[a[0]:a[1]]
 673 }
 674
 675 // FindStringIndex returns a two-element slice of integers defining the
 676 // location of the leftmost match in s of the regular expression.  The match
 677 // itself is at s[loc[0]:loc[1]].
 678 // A return value of nil indicates no match.
 679 func (re *Regexp) FindStringIndex(s string) (loc []int) {
 680         a := re.doExecute(nil, nil, s, 0, 2)
 681         if a == nil {
 682                 return nil
 683         }
 684         return a[0:2]
 685 }
 686
 687 // FindReaderIndex returns a two-element slice of integers defining the
 688 // location of the leftmost match of the regular expression in text read from
 689 // the RuneReader.  The match text was found in the input stream at
 690 // byte offset loc[0] through loc[1]-1.
 691 // A return value of nil indicates no match.
 692 func (re *Regexp) FindReaderIndex(r io.RuneReader) (loc []int) {
 693         a := re.doExecute(r, nil, "", 0, 2)
 694         if a == nil {
 695                 return nil
 696         }
 697         return a[0:2]
 698 }
 699
 700 // FindSubmatch returns a slice of slices holding the text of the leftmost
 701 // match of the regular expression in b and the matches, if any, of its
 702 // subexpressions, as defined by the 'Submatch' descriptions in the package
 703 // comment.
 704 // A return value of nil indicates no match.
 705 func (re *Regexp) FindSubmatch(b []byte) [][]byte {
 706         a := re.doExecute(nil, b, "", 0, re.prog.NumCap)
 707         if a == nil {
 708                 return nil
 709         }
 710         ret := make([][]byte, 1+re.numSubexp)
 711         for i := range ret {
 712                 if 2*i < len(a) && a[2*i] >= 0 {
 713                         ret[i] = b[a[2*i]:a[2*i+1]]
 714                 }
 715         }
 716         return ret
 717 }
 718
 719 // Expand appends template to dst and returns the result; during the
 720 // append, Expand replaces variables in the template with corresponding
 721 // matches drawn from src.  The match slice should have been returned by
 722 // FindSubmatchIndex.
 723 //
 724 // In the template, a variable is denoted by a substring of the form
 725 // $name or ${name}, where name is a non-empty sequence of letters,
 726 // digits, and underscores.  A purely numeric name like $1 refers to
 727 // the submatch with the corresponding index; other names refer to
 728 // capturing parentheses named with the (?P<name>...) syntax.  A
 729 // reference to an out of range or unmatched index or a name that is not
 730 // present in the regular expression is replaced with an empty slice.
 731 //
 732 // In the $name form, name is taken to be as long as possible: $1x is
 733 // equivalent to ${1x}, not ${1}x, and, $10 is equivalent to ${10}, not ${1}0.
 734 //
 735 // To insert a literal $ in the output, use $$ in the template.
 736 func (re *Regexp) Expand(dst []byte, template []byte, src []byte, match []int) []byte {
 737         return re.expand(dst, string(template), src, "", match)
 738 }
 739
 740 // ExpandString is like Expand but the template and source are strings.
 741 // It appends to and returns a byte slice in order to give the calling
 742 // code control over allocation.
 743 func (re *Regexp) ExpandString(dst []byte, template string, src string, match []int) []byte {
 744         return re.expand(dst, template, nil, src, match)
 745 }
 746
 747 func (re *Regexp) expand(dst []byte, template string, bsrc []byte, src string, match []int) []byte {
 748         for len(template) > 0 {
 749                 i := strings.Index(template, "$")
 750                 if i < 0 {
 751                         break
 752                 }
 753                 dst = append(dst, template[:i]...)
 754                 template = template[i:]
 755                 if len(template) > 1 && template[1] == '$' {
 756                         // Treat $$ as $.
 757                         dst = append(dst, '$')
 758                         template = template[2:]
 759                         continue
 760                 }
 761                 name, num, rest, ok := extract(template)
 762                 if !ok {
 763                         // Malformed; treat $ as raw text.
 764                         dst = append(dst, '$')
 765                         template = template[1:]
 766                         continue
 767                 }
 768                 template = rest
 769                 if num >= 0 {
 770                         if 2*num+1 < len(match) {
 771                                 if bsrc != nil {
 772                                         dst = append(dst, bsrc[match[2*num]:match[2*num+1]]...)
 773                                 } else {
 774                                         dst = append(dst, src[match[2*num]:match[2*num+1]]...)
 775                                 }
 776                         }
 777                 } else {
 778                         for i, namei := range re.subexpNames {
 779                                 if name == namei && 2*i+1 < len(match) && match[2*i] >= 0 {
 780                                         if bsrc != nil {
 781                                                 dst = append(dst, bsrc[match[2*i]:match[2*i+1]]...)
 782                                         } else {
 783                                                 dst = append(dst, src[match[2*i]:match[2*i+1]]...)
 784                                         }
 785                                         break
 786                                 }
 787                         }
 788                 }
 789         }
 790         dst = append(dst, template...)
 791         return dst
 792 }
 793
 794 // extract returns the name from a leading "$name" or "${name}" in str.
 795 // If it is a number, extract returns num set to that number; otherwise num = -1.
 796 func extract(str string) (name string, num int, rest string, ok bool) {
 797         if len(str) < 2 || str[0] != '$' {
 798                 return
 799         }
 800         brace := false
 801         if str[1] == '{' {
 802                 brace = true
 803                 str = str[2:]
 804         } else {
 805                 str = str[1:]
 806         }
 807         i := 0
 808         for i < len(str) {
 809                 rune, size := utf8.DecodeRuneInString(str[i:])
 810                 if !unicode.IsLetter(rune) && !unicode.IsDigit(rune) && rune != '_' {
 811                         break
 812                 }
 813                 i += size
 814         }
 815         if i == 0 {
 816                 // empty name is not okay
 817                 return
 818         }
 819         name = str[:i]
 820         if brace {
 821                 if i >= len(str) || str[i] != '}' {
 822                         // missing closing brace
 823                         return
 824                 }
 825                 i++
 826         }
 827
 828         // Parse number.
 829         num = 0
 830         for i := 0; i < len(name); i++ {
 831                 if name[i] < '0' || '9' < name[i] || num >= 1e8 {
 832                         num = -1
 833                         break
 834                 }
 835                 num = num*10 + int(name[i]) - '0'
 836         }
 837         // Disallow leading zeros.
 838         if name[0] == '0' && len(name) > 1 {
 839                 num = -1
 840         }
 841
 842         rest = str[i:]
 843         ok = true
 844         return
 845 }
 846
 847 // FindSubmatchIndex returns a slice holding the index pairs identifying the
 848 // leftmost match of the regular expression in b and the matches, if any, of
 849 // its subexpressions, as defined by the 'Submatch' and 'Index' descriptions
 850 // in the package comment.
 851 // A return value of nil indicates no match.
 852 func (re *Regexp) FindSubmatchIndex(b []byte) []int {
 853         return re.pad(re.doExecute(nil, b, "", 0, re.prog.NumCap))
 854 }
 855
 856 // FindStringSubmatch returns a slice of strings holding the text of the
 857 // leftmost match of the regular expression in s and the matches, if any, of
 858 // its subexpressions, as defined by the 'Submatch' description in the
 859 // package comment.
 860 // A return value of nil indicates no match.
 861 func (re *Regexp) FindStringSubmatch(s string) []string {
 862         a := re.doExecute(nil, nil, s, 0, re.prog.NumCap)
 863         if a == nil {
 864                 return nil
 865         }
 866         ret := make([]string, 1+re.numSubexp)
 867         for i := range ret {
 868                 if 2*i < len(a) && a[2*i] >= 0 {
 869                         ret[i] = s[a[2*i]:a[2*i+1]]
 870                 }
 871         }
 872         return ret
 873 }
 874
 875 // FindStringSubmatchIndex returns a slice holding the index pairs
 876 // identifying the leftmost match of the regular expression in s and the
 877 // matches, if any, of its subexpressions, as defined by the 'Submatch' and
 878 // 'Index' descriptions in the package comment.
 879 // A return value of nil indicates no match.
 880 func (re *Regexp) FindStringSubmatchIndex(s string) []int {
 881         return re.pad(re.doExecute(nil, nil, s, 0, re.prog.NumCap))
 882 }
 883
 884 // FindReaderSubmatchIndex returns a slice holding the index pairs
 885 // identifying the leftmost match of the regular expression of text read by
 886 // the RuneReader, and the matches, if any, of its subexpressions, as defined
 887 // by the 'Submatch' and 'Index' descriptions in the package comment.  A
 888 // return value of nil indicates no match.
 889 func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int {
 890         return re.pad(re.doExecute(r, nil, "", 0, re.prog.NumCap))
 891 }
 892
 893 const startSize = 10 // The size at which to start a slice in the 'All' routines.
 894
 895 // FindAll is the 'All' version of Find; it returns a slice of all successive
 896 // matches of the expression, as defined by the 'All' description in the
 897 // package comment.
 898 // A return value of nil indicates no match.
 899 func (re *Regexp) FindAll(b []byte, n int) [][]byte {
 900         if n < 0 {
 901                 n = len(b) + 1
 902         }
 903         result := make([][]byte, 0, startSize)
 904         re.allMatches("", b, n, func(match []int) {
 905                 result = append(result, b[match[0]:match[1]])
 906         })
 907         if len(result) == 0 {
 908                 return nil
 909         }
 910         return result
 911 }
 912
 913 // FindAllIndex is the 'All' version of FindIndex; it returns a slice of all
 914 // successive matches of the expression, as defined by the 'All' description
 915 // in the package comment.
 916 // A return value of nil indicates no match.
 917 func (re *Regexp) FindAllIndex(b []byte, n int) [][]int {
 918         if n < 0 {
 919                 n = len(b) + 1
 920         }
 921         result := make([][]int, 0, startSize)
 922         re.allMatches("", b, n, func(match []int) {
 923                 result = append(result, match[0:2])
 924         })
 925         if len(result) == 0 {
 926                 return nil
 927         }
 928         return result
 929 }
 930
 931 // FindAllString is the 'All' version of FindString; it returns a slice of all
 932 // successive matches of the expression, as defined by the 'All' description
 933 // in the package comment.
 934 // A return value of nil indicates no match.
 935 func (re *Regexp) FindAllString(s string, n int) []string {
 936         if n < 0 {
 937                 n = len(s) + 1
 938         }
 939         result := make([]string, 0, startSize)
 940         re.allMatches(s, nil, n, func(match []int) {
 941                 result = append(result, s[match[0]:match[1]])
 942         })
 943         if len(result) == 0 {
 944                 return nil
 945         }
 946         return result
 947 }
 948
 949 // FindAllStringIndex is the 'All' version of FindStringIndex; it returns a
 950 // slice of all successive matches of the expression, as defined by the 'All'
 951 // description in the package comment.
 952 // A return value of nil indicates no match.
 953 func (re *Regexp) FindAllStringIndex(s string, n int) [][]int {
 954         if n < 0 {
 955                 n = len(s) + 1
 956         }
 957         result := make([][]int, 0, startSize)
 958         re.allMatches(s, nil, n, func(match []int) {
 959                 result = append(result, match[0:2])
 960         })
 961         if len(result) == 0 {
 962                 return nil
 963         }
 964         return result
 965 }
 966
 967 // FindAllSubmatch is the 'All' version of FindSubmatch; it returns a slice
 968 // of all successive matches of the expression, as defined by the 'All'
 969 // description in the package comment.
 970 // A return value of nil indicates no match.
 971 func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte {
 972         if n < 0 {
 973                 n = len(b) + 1
 974         }
 975         result := make([][][]byte, 0, startSize)
 976         re.allMatches("", b, n, func(match []int) {
 977                 slice := make([][]byte, len(match)/2)
 978                 for j := range slice {
 979                         if match[2*j] >= 0 {
 980                                 slice[j] = b[match[2*j]:match[2*j+1]]
 981                         }
 982                 }
 983                 result = append(result, slice)
 984         })
 985         if len(result) == 0 {
 986                 return nil
 987         }
 988         return result
 989 }
 990
 991 // FindAllSubmatchIndex is the 'All' version of FindSubmatchIndex; it returns
 992 // a slice of all successive matches of the expression, as defined by the
 993 // 'All' description in the package comment.
 994 // A return value of nil indicates no match.
 995 func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int {
 996         if n < 0 {
 997                 n = len(b) + 1
 998         }
 999         result := make([][]int, 0, startSize)
1000         re.allMatches("", b, n, func(match []int) {
1001                 result = append(result, match)
1002         })
1003         if len(result) == 0 {
1004                 return nil
1005         }
1006         return result
1007 }
1008
1009 // FindAllStringSubmatch is the 'All' version of FindStringSubmatch; it
1010 // returns a slice of all successive matches of the expression, as defined by
1011 // the 'All' description in the package comment.
1012 // A return value of nil indicates no match.
1013 func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string {
1014         if n < 0 {
1015                 n = len(s) + 1
1016         }
1017         result := make([][]string, 0, startSize)
1018         re.allMatches(s, nil, n, func(match []int) {
1019                 slice := make([]string, len(match)/2)
1020                 for j := range slice {
1021                         if match[2*j] >= 0 {
1022                                 slice[j] = s[match[2*j]:match[2*j+1]]
1023                         }
1024                 }
1025                 result = append(result, slice)
1026         })
1027         if len(result) == 0 {
1028                 return nil
1029         }
1030         return result
1031 }
1032
1033 // FindAllStringSubmatchIndex is the 'All' version of
1034 // FindStringSubmatchIndex; it returns a slice of all successive matches of
1035 // the expression, as defined by the 'All' description in the package
1036 // comment.
1037 // A return value of nil indicates no match.
1038 func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int {
1039         if n < 0 {
1040                 n = len(s) + 1
1041         }
1042         result := make([][]int, 0, startSize)
1043         re.allMatches(s, nil, n, func(match []int) {
1044                 result = append(result, match)
1045         })
1046         if len(result) == 0 {
1047                 return nil
1048         }
1049         return result
1050 }