libgo/go/strings/strings.go

   1 // Copyright 2009 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 // Package strings implements simple functions to manipulate strings.
   6 package strings
   7
   8 import (
   9         "unicode"
  10         "unicode/utf8"
  11 )
  12
  13 // explode splits s into an array of UTF-8 sequences, one per Unicode character (still strings) up to a maximum of n (n < 0 means no limit).
  14 // Invalid UTF-8 sequences become correct encodings of U+FFF8.
  15 func explode(s string, n int) []string {
  16         if n == 0 {
  17                 return nil
  18         }
  19         l := utf8.RuneCountInString(s)
  20         if n <= 0 || n > l {
  21                 n = l
  22         }
  23         a := make([]string, n)
  24         var size int
  25         var ch rune
  26         i, cur := 0, 0
  27         for ; i+1 < n; i++ {
  28                 ch, size = utf8.DecodeRuneInString(s[cur:])
  29                 a[i] = string(ch)
  30                 cur += size
  31         }
  32         // add the rest, if there is any
  33         if cur < len(s) {
  34                 a[i] = s[cur:]
  35         }
  36         return a
  37 }
  38
  39 // Count counts the number of non-overlapping instances of sep in s.
  40 func Count(s, sep string) int {
  41         if sep == "" {
  42                 return utf8.RuneCountInString(s) + 1
  43         }
  44         c := sep[0]
  45         l := len(sep)
  46         n := 0
  47         if l == 1 {
  48                 // special case worth making fast
  49                 for i := 0; i < len(s); i++ {
  50                         if s[i] == c {
  51                                 n++
  52                         }
  53                 }
  54                 return n
  55         }
  56         for i := 0; i+l <= len(s); i++ {
  57                 if s[i] == c && s[i:i+l] == sep {
  58                         n++
  59                         i += l - 1
  60                 }
  61         }
  62         return n
  63 }
  64
  65 // Contains returns true if substr is within s.
  66 func Contains(s, substr string) bool {
  67         return Index(s, substr) >= 0
  68 }
  69
  70 // ContainsAny returns true if any Unicode code points in chars are within s.
  71 func ContainsAny(s, chars string) bool {
  72         return IndexAny(s, chars) >= 0
  73 }
  74
  75 // ContainsRune returns true if the Unicode code point r is within s.
  76 func ContainsRune(s string, r rune) bool {
  77         return IndexRune(s, r) >= 0
  78 }
  79
  80 // Index returns the index of the first instance of sep in s, or -1 if sep is not present in s.
  81 func Index(s, sep string) int {
  82         n := len(sep)
  83         if n == 0 {
  84                 return 0
  85         }
  86         c := sep[0]
  87         if n == 1 {
  88                 // special case worth making fast
  89                 for i := 0; i < len(s); i++ {
  90                         if s[i] == c {
  91                                 return i
  92                         }
  93                 }
  94                 return -1
  95         }
  96         // n > 1
  97         for i := 0; i+n <= len(s); i++ {
  98                 if s[i] == c && s[i:i+n] == sep {
  99                         return i
 100                 }
 101         }
 102         return -1
 103 }
 104
 105 // LastIndex returns the index of the last instance of sep in s, or -1 if sep is not present in s.
 106 func LastIndex(s, sep string) int {
 107         n := len(sep)
 108         if n == 0 {
 109                 return len(s)
 110         }
 111         c := sep[0]
 112         if n == 1 {
 113                 // special case worth making fast
 114                 for i := len(s) - 1; i >= 0; i-- {
 115                         if s[i] == c {
 116                                 return i
 117                         }
 118                 }
 119                 return -1
 120         }
 121         // n > 1
 122         for i := len(s) - n; i >= 0; i-- {
 123                 if s[i] == c && s[i:i+n] == sep {
 124                         return i
 125                 }
 126         }
 127         return -1
 128 }
 129
 130 // IndexRune returns the index of the first instance of the Unicode code point
 131 // r, or -1 if rune is not present in s.
 132 func IndexRune(s string, r rune) int {
 133         switch {
 134         case r < 0x80:
 135                 b := byte(r)
 136                 for i := 0; i < len(s); i++ {
 137                         if s[i] == b {
 138                                 return i
 139                         }
 140                 }
 141         default:
 142                 for i, c := range s {
 143                         if c == r {
 144                                 return i
 145                         }
 146                 }
 147         }
 148         return -1
 149 }
 150
 151 // IndexAny returns the index of the first instance of any Unicode code point
 152 // from chars in s, or -1 if no Unicode code point from chars is present in s.
 153 func IndexAny(s, chars string) int {
 154         if len(chars) > 0 {
 155                 for i, c := range s {
 156                         for _, m := range chars {
 157                                 if c == m {
 158                                         return i
 159                                 }
 160                         }
 161                 }
 162         }
 163         return -1
 164 }
 165
 166 // LastIndexAny returns the index of the last instance of any Unicode code
 167 // point from chars in s, or -1 if no Unicode code point from chars is
 168 // present in s.
 169 func LastIndexAny(s, chars string) int {
 170         if len(chars) > 0 {
 171                 for i := len(s); i > 0; {
 172                         rune, size := utf8.DecodeLastRuneInString(s[0:i])
 173                         i -= size
 174                         for _, m := range chars {
 175                                 if rune == m {
 176                                         return i
 177                                 }
 178                         }
 179                 }
 180         }
 181         return -1
 182 }
 183
 184 // Generic split: splits after each instance of sep,
 185 // including sepSave bytes of sep in the subarrays.
 186 func genSplit(s, sep string, sepSave, n int) []string {
 187         if n == 0 {
 188                 return nil
 189         }
 190         if sep == "" {
 191                 return explode(s, n)
 192         }
 193         if n < 0 {
 194                 n = Count(s, sep) + 1
 195         }
 196         c := sep[0]
 197         start := 0
 198         a := make([]string, n)
 199         na := 0
 200         for i := 0; i+len(sep) <= len(s) && na+1 < n; i++ {
 201                 if s[i] == c && (len(sep) == 1 || s[i:i+len(sep)] == sep) {
 202                         a[na] = s[start : i+sepSave]
 203                         na++
 204                         start = i + len(sep)
 205                         i += len(sep) - 1
 206                 }
 207         }
 208         a[na] = s[start:]
 209         return a[0 : na+1]
 210 }
 211
 212 // SplitN slices s into substrings separated by sep and returns a slice of
 213 // the substrings between those separators.
 214 // If sep is empty, SplitN splits after each UTF-8 sequence.
 215 // The count determines the number of substrings to return:
 216 //   n > 0: at most n substrings; the last substring will be the unsplit remainder.
 217 //   n == 0: the result is nil (zero substrings)
 218 //   n < 0: all substrings
 219 func SplitN(s, sep string, n int) []string { return genSplit(s, sep, 0, n) }
 220
 221 // SplitAfterN slices s into substrings after each instance of sep and
 222 // returns a slice of those substrings.
 223 // If sep is empty, SplitAfterN splits after each UTF-8 sequence.
 224 // The count determines the number of substrings to return:
 225 //   n > 0: at most n substrings; the last substring will be the unsplit remainder.
 226 //   n == 0: the result is nil (zero substrings)
 227 //   n < 0: all substrings
 228 func SplitAfterN(s, sep string, n int) []string {
 229         return genSplit(s, sep, len(sep), n)
 230 }
 231
 232 // Split slices s into all substrings separated by sep and returns a slice of
 233 // the substrings between those separators.
 234 // If sep is empty, Split splits after each UTF-8 sequence.
 235 // It is equivalent to SplitN with a count of -1.
 236 func Split(s, sep string) []string { return genSplit(s, sep, 0, -1) }
 237
 238 // SplitAfter slices s into all substrings after each instance of sep and
 239 // returns a slice of those substrings.
 240 // If sep is empty, SplitAfter splits after each UTF-8 sequence.
 241 // It is equivalent to SplitAfterN with a count of -1.
 242 func SplitAfter(s, sep string) []string {
 243         return genSplit(s, sep, len(sep), -1)
 244 }
 245
 246 // Fields splits the string s around each instance of one or more consecutive white space
 247 // characters, returning an array of substrings of s or an empty list if s contains only white space.
 248 func Fields(s string) []string {
 249         return FieldsFunc(s, unicode.IsSpace)
 250 }
 251
 252 // FieldsFunc splits the string s at each run of Unicode code points c satisfying f(c)
 253 // and returns an array of slices of s. If all code points in s satisfy f(c) or the
 254 // string is empty, an empty slice is returned.
 255 func FieldsFunc(s string, f func(rune) bool) []string {
 256         // First count the fields.
 257         n := 0
 258         inField := false
 259         for _, rune := range s {
 260                 wasInField := inField
 261                 inField = !f(rune)
 262                 if inField && !wasInField {
 263                         n++
 264                 }
 265         }
 266
 267         // Now create them.
 268         a := make([]string, n)
 269         na := 0
 270         fieldStart := -1 // Set to -1 when looking for start of field.
 271         for i, rune := range s {
 272                 if f(rune) {
 273                         if fieldStart >= 0 {
 274                                 a[na] = s[fieldStart:i]
 275                                 na++
 276                                 fieldStart = -1
 277                         }
 278                 } else if fieldStart == -1 {
 279                         fieldStart = i
 280                 }
 281         }
 282         if fieldStart >= 0 { // Last field might end at EOF.
 283                 a[na] = s[fieldStart:]
 284         }
 285         return a
 286 }
 287
 288 // Join concatenates the elements of a to create a single string.   The separator string
 289 // sep is placed between elements in the resulting string.
 290 func Join(a []string, sep string) string {
 291         if len(a) == 0 {
 292                 return ""
 293         }
 294         if len(a) == 1 {
 295                 return a[0]
 296         }
 297         n := len(sep) * (len(a) - 1)
 298         for i := 0; i < len(a); i++ {
 299                 n += len(a[i])
 300         }
 301
 302         b := make([]byte, n)
 303         bp := copy(b, a[0])
 304         for _, s := range a[1:] {
 305                 bp += copy(b[bp:], sep)
 306                 bp += copy(b[bp:], s)
 307         }
 308         return string(b)
 309 }
 310
 311 // HasPrefix tests whether the string s begins with prefix.
 312 func HasPrefix(s, prefix string) bool {
 313         return len(s) >= len(prefix) && s[0:len(prefix)] == prefix
 314 }
 315
 316 // HasSuffix tests whether the string s ends with suffix.
 317 func HasSuffix(s, suffix string) bool {
 318         return len(s) >= len(suffix) && s[len(s)-len(suffix):] == suffix
 319 }
 320
 321 // Map returns a copy of the string s with all its characters modified
 322 // according to the mapping function. If mapping returns a negative value, the character is
 323 // dropped from the string with no replacement.
 324 func Map(mapping func(rune) rune, s string) string {
 325         // In the worst case, the string can grow when mapped, making
 326         // things unpleasant.  But it's so rare we barge in assuming it's
 327         // fine.  It could also shrink but that falls out naturally.
 328         maxbytes := len(s) // length of b
 329         nbytes := 0        // number of bytes encoded in b
 330         // The output buffer b is initialized on demand, the first
 331         // time a character differs.
 332         var b []byte
 333
 334         for i, c := range s {
 335                 r := mapping(c)
 336                 if b == nil {
 337                         if r == c {
 338                                 continue
 339                         }
 340                         b = make([]byte, maxbytes)
 341                         nbytes = copy(b, s[:i])
 342                 }
 343                 if r >= 0 {
 344                         wid := 1
 345                         if r >= utf8.RuneSelf {
 346                                 wid = utf8.RuneLen(r)
 347                         }
 348                         if nbytes+wid > maxbytes {
 349                                 // Grow the buffer.
 350                                 maxbytes = maxbytes*2 + utf8.UTFMax
 351                                 nb := make([]byte, maxbytes)
 352                                 copy(nb, b[0:nbytes])
 353                                 b = nb
 354                         }
 355                         nbytes += utf8.EncodeRune(b[nbytes:maxbytes], r)
 356                 }
 357         }
 358         if b == nil {
 359                 return s
 360         }
 361         return string(b[0:nbytes])
 362 }
 363
 364 // Repeat returns a new string consisting of count copies of the string s.
 365 func Repeat(s string, count int) string {
 366         b := make([]byte, len(s)*count)
 367         bp := 0
 368         for i := 0; i < count; i++ {
 369                 for j := 0; j < len(s); j++ {
 370                         b[bp] = s[j]
 371                         bp++
 372                 }
 373         }
 374         return string(b)
 375 }
 376
 377 // ToUpper returns a copy of the string s with all Unicode letters mapped to their upper case.
 378 func ToUpper(s string) string { return Map(unicode.ToUpper, s) }
 379
 380 // ToLower returns a copy of the string s with all Unicode letters mapped to their lower case.
 381 func ToLower(s string) string { return Map(unicode.ToLower, s) }
 382
 383 // ToTitle returns a copy of the string s with all Unicode letters mapped to their title case.
 384 func ToTitle(s string) string { return Map(unicode.ToTitle, s) }
 385
 386 // ToUpperSpecial returns a copy of the string s with all Unicode letters mapped to their
 387 // upper case, giving priority to the special casing rules.
 388 func ToUpperSpecial(_case unicode.SpecialCase, s string) string {
 389         return Map(func(r rune) rune { return _case.ToUpper(r) }, s)
 390 }
 391
 392 // ToLowerSpecial returns a copy of the string s with all Unicode letters mapped to their
 393 // lower case, giving priority to the special casing rules.
 394 func ToLowerSpecial(_case unicode.SpecialCase, s string) string {
 395         return Map(func(r rune) rune { return _case.ToLower(r) }, s)
 396 }
 397
 398 // ToTitleSpecial returns a copy of the string s with all Unicode letters mapped to their
 399 // title case, giving priority to the special casing rules.
 400 func ToTitleSpecial(_case unicode.SpecialCase, s string) string {
 401         return Map(func(r rune) rune { return _case.ToTitle(r) }, s)
 402 }
 403
 404 // isSeparator reports whether the rune could mark a word boundary.
 405 // TODO: update when package unicode captures more of the properties.
 406 func isSeparator(r rune) bool {
 407         // ASCII alphanumerics and underscore are not separators
 408         if r <= 0x7F {
 409                 switch {
 410                 case '0' <= r && r <= '9':
 411                         return false
 412                 case 'a' <= r && r <= 'z':
 413                         return false
 414                 case 'A' <= r && r <= 'Z':
 415                         return false
 416                 case r == '_':
 417                         return false
 418                 }
 419                 return true
 420         }
 421         // Letters and digits are not separators
 422         if unicode.IsLetter(r) || unicode.IsDigit(r) {
 423                 return false
 424         }
 425         // Otherwise, all we can do for now is treat spaces as separators.
 426         return unicode.IsSpace(r)
 427 }
 428
 429 // BUG(r): The rule Title uses for word boundaries does not handle Unicode punctuation properly.
 430
 431 // Title returns a copy of the string s with all Unicode letters that begin words
 432 // mapped to their title case.
 433 func Title(s string) string {
 434         // Use a closure here to remember state.
 435         // Hackish but effective. Depends on Map scanning in order and calling
 436         // the closure once per rune.
 437         prev := ' '
 438         return Map(
 439                 func(r rune) rune {
 440                         if isSeparator(prev) {
 441                                 prev = r
 442                                 return unicode.ToTitle(r)
 443                         }
 444                         prev = r
 445                         return r
 446                 },
 447                 s)
 448 }
 449
 450 // TrimLeftFunc returns a slice of the string s with all leading
 451 // Unicode code points c satisfying f(c) removed.
 452 func TrimLeftFunc(s string, f func(rune) bool) string {
 453         i := indexFunc(s, f, false)
 454         if i == -1 {
 455                 return ""
 456         }
 457         return s[i:]
 458 }
 459
 460 // TrimRightFunc returns a slice of the string s with all trailing
 461 // Unicode code points c satisfying f(c) removed.
 462 func TrimRightFunc(s string, f func(rune) bool) string {
 463         i := lastIndexFunc(s, f, false)
 464         if i >= 0 && s[i] >= utf8.RuneSelf {
 465                 _, wid := utf8.DecodeRuneInString(s[i:])
 466                 i += wid
 467         } else {
 468                 i++
 469         }
 470         return s[0:i]
 471 }
 472
 473 // TrimFunc returns a slice of the string s with all leading
 474 // and trailing Unicode code points c satisfying f(c) removed.
 475 func TrimFunc(s string, f func(rune) bool) string {
 476         return TrimRightFunc(TrimLeftFunc(s, f), f)
 477 }
 478
 479 // IndexFunc returns the index into s of the first Unicode
 480 // code point satisfying f(c), or -1 if none do.
 481 func IndexFunc(s string, f func(rune) bool) int {
 482         return indexFunc(s, f, true)
 483 }
 484
 485 // LastIndexFunc returns the index into s of the last
 486 // Unicode code point satisfying f(c), or -1 if none do.
 487 func LastIndexFunc(s string, f func(rune) bool) int {
 488         return lastIndexFunc(s, f, true)
 489 }
 490
 491 // indexFunc is the same as IndexFunc except that if
 492 // truth==false, the sense of the predicate function is
 493 // inverted.
 494 func indexFunc(s string, f func(rune) bool, truth bool) int {
 495         start := 0
 496         for start < len(s) {
 497                 wid := 1
 498                 r := rune(s[start])
 499                 if r >= utf8.RuneSelf {
 500                         r, wid = utf8.DecodeRuneInString(s[start:])
 501                 }
 502                 if f(r) == truth {
 503                         return start
 504                 }
 505                 start += wid
 506         }
 507         return -1
 508 }
 509
 510 // lastIndexFunc is the same as LastIndexFunc except that if
 511 // truth==false, the sense of the predicate function is
 512 // inverted.
 513 func lastIndexFunc(s string, f func(rune) bool, truth bool) int {
 514         for i := len(s); i > 0; {
 515                 r, size := utf8.DecodeLastRuneInString(s[0:i])
 516                 i -= size
 517                 if f(r) == truth {
 518                         return i
 519                 }
 520         }
 521         return -1
 522 }
 523
 524 func makeCutsetFunc(cutset string) func(rune) bool {
 525         return func(r rune) bool { return IndexRune(cutset, r) >= 0 }
 526 }
 527
 528 // Trim returns a slice of the string s with all leading and
 529 // trailing Unicode code points contained in cutset removed.
 530 func Trim(s string, cutset string) string {
 531         if s == "" || cutset == "" {
 532                 return s
 533         }
 534         return TrimFunc(s, makeCutsetFunc(cutset))
 535 }
 536
 537 // TrimLeft returns a slice of the string s with all leading
 538 // Unicode code points contained in cutset removed.
 539 func TrimLeft(s string, cutset string) string {
 540         if s == "" || cutset == "" {
 541                 return s
 542         }
 543         return TrimLeftFunc(s, makeCutsetFunc(cutset))
 544 }
 545
 546 // TrimRight returns a slice of the string s, with all trailing
 547 // Unicode code points contained in cutset removed.
 548 func TrimRight(s string, cutset string) string {
 549         if s == "" || cutset == "" {
 550                 return s
 551         }
 552         return TrimRightFunc(s, makeCutsetFunc(cutset))
 553 }
 554
 555 // TrimSpace returns a slice of the string s, with all leading
 556 // and trailing white space removed, as defined by Unicode.
 557 func TrimSpace(s string) string {
 558         return TrimFunc(s, unicode.IsSpace)
 559 }
 560
 561 // Replace returns a copy of the string s with the first n
 562 // non-overlapping instances of old replaced by new.
 563 // If n < 0, there is no limit on the number of replacements.
 564 func Replace(s, old, new string, n int) string {
 565         if old == new || n == 0 {
 566                 return s // avoid allocation
 567         }
 568
 569         // Compute number of replacements.
 570         if m := Count(s, old); m == 0 {
 571                 return s // avoid allocation
 572         } else if n < 0 || m < n {
 573                 n = m
 574         }
 575
 576         // Apply replacements to buffer.
 577         t := make([]byte, len(s)+n*(len(new)-len(old)))
 578         w := 0
 579         start := 0
 580         for i := 0; i < n; i++ {
 581                 j := start
 582                 if len(old) == 0 {
 583                         if i > 0 {
 584                                 _, wid := utf8.DecodeRuneInString(s[start:])
 585                                 j += wid
 586                         }
 587                 } else {
 588                         j += Index(s[start:], old)
 589                 }
 590                 w += copy(t[w:], s[start:j])
 591                 w += copy(t[w:], new)
 592                 start = j + len(old)
 593         }
 594         w += copy(t[w:], s[start:])
 595         return string(t[0:w])
 596 }
 597
 598 // EqualFold reports whether s and t, interpreted as UTF-8 strings,
 599 // are equal under Unicode case-folding.
 600 func EqualFold(s, t string) bool {
 601         for s != "" && t != "" {
 602                 // Extract first rune from each string.
 603                 var sr, tr rune
 604                 if s[0] < utf8.RuneSelf {
 605                         sr, s = rune(s[0]), s[1:]
 606                 } else {
 607                         r, size := utf8.DecodeRuneInString(s)
 608                         sr, s = r, s[size:]
 609                 }
 610                 if t[0] < utf8.RuneSelf {
 611                         tr, t = rune(t[0]), t[1:]
 612                 } else {
 613                         r, size := utf8.DecodeRuneInString(t)
 614                         tr, t = r, t[size:]
 615                 }
 616
 617                 // If they match, keep going; if not, return false.
 618
 619                 // Easy case.
 620                 if tr == sr {
 621                         continue
 622                 }
 623
 624                 // Make sr < tr to simplify what follows.
 625                 if tr < sr {
 626                         tr, sr = sr, tr
 627                 }
 628                 // Fast check for ASCII.
 629                 if tr < utf8.RuneSelf && 'A' <= sr && sr <= 'Z' {
 630                         // ASCII, and sr is upper case.  tr must be lower case.
 631                         if tr == sr+'a'-'A' {
 632                                 continue
 633                         }
 634                         return false
 635                 }
 636
 637                 // General case.  SimpleFold(x) returns the next equivalent rune > x
 638                 // or wraps around to smaller values.
 639                 r := unicode.SimpleFold(sr)
 640                 for r != sr && r < tr {
 641                         r = unicode.SimpleFold(r)
 642                 }
 643                 if r == tr {
 644                         continue
 645                 }
 646                 return false
 647         }
 648
 649         // One string is empty.  Are both?
 650         return s == t
 651 }