libgo/go/text/template/parse/lex.go

   1 // Copyright 2011 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 package parse
   6
   7 import (
   8         "fmt"
   9         "strings"
  10         "unicode"
  11         "unicode/utf8"
  12 )
  13
  14 // item is one lexical token, or an error report, delivered by the scanner.
  15 type item struct {
  16         typ itemType // The kind of token this is.
  17         pos Pos      // The byte offset in the input string at which the token starts.
  18         val string   // The token's text; for itemError, the error message instead.
  19 }
  20
  21 func (i item) String() string {
  22         switch {
  23         case i.typ == itemEOF:
  24                 return "EOF"
  25         case i.typ == itemError:
  26                 return i.val
  27         case i.typ > itemKeyword:
  28                 return fmt.Sprintf("<%s>", i.val)
  29         case len(i.val) > 10:
  30                 return fmt.Sprintf("%.10q...", i.val)
  31         }
  32         return fmt.Sprintf("%q", i.val)
  33 }
  34
  35 // itemType identifies the kind of a lexed item.
  36 type itemType int
  37
  38 const (
  39         itemError        itemType = iota // error occurred; value is text of error
  40         itemBool                         // boolean constant
  41         itemChar                         // printable ASCII character; grab bag for comma etc.
  42         itemCharConstant                 // character constant
  43         itemComplex                      // complex constant (1+2i); imaginary is just a number
  44         itemColonEquals                  // colon-equals (':=') introducing a declaration
  45         itemEOF                          // end of input
  46         itemField      // alphanumeric identifier starting with '.'
  47         itemIdentifier // alphanumeric identifier not starting with '.'
  48         itemLeftDelim  // left action delimiter
  49         itemLeftParen  // '(' inside action
  50         itemNumber     // simple number, including imaginary
  51         itemPipe       // pipe symbol
  52         itemRawString  // raw quoted string (includes quotes)
  53         itemRightDelim // right action delimiter
  54         itemRightParen // ')' inside action
  55         itemSpace      // run of spaces separating arguments
  56         itemString     // quoted string (includes quotes)
  57         itemText       // plain text
  58         itemVariable   // variable starting with '$', such as '$' or '$1' or '$hello'
  59         // Keywords appear after all the rest, so that typ > itemKeyword identifies them.
  60         itemKeyword  // used only to delimit the keywords
  61         itemDot      // the cursor, spelled '.'
  62         itemDefine   // define keyword
  63         itemElse     // else keyword
  64         itemEnd      // end keyword
  65         itemIf       // if keyword
  66         itemNil      // the untyped nil constant, easiest to treat as a keyword
  67         itemRange    // range keyword
  68         itemTemplate // template keyword
  69         itemWith     // with keyword
  70 )
  71
  72 var key = map[string]itemType{
  73         ".":        itemDot,
  74         "define":   itemDefine,
  75         "else":     itemElse,
  76         "end":      itemEnd,
  77         "if":       itemIf,
  78         "range":    itemRange,
  79         "nil":      itemNil,
  80         "template": itemTemplate,
  81         "with":     itemWith,
  82 }
  83
  84 const eof = -1 // rune value that next returns once the input is exhausted
  85
  86 // stateFn is a scanner state: a function that lexes some input and returns the next state (nil stops the scan).
  87 type stateFn func(*lexer) stateFn
  88
  89 // lexer holds the state of the scanner.
  90 type lexer struct {
  91         name       string    // the name of the input; used only for error reports
  92         input      string    // the string being scanned
  93         leftDelim  string    // delimiter that opens an action
  94         rightDelim string    // delimiter that closes an action
  95         state      stateFn   // the next lexing function to enter
  96         pos        Pos       // current position in the input, in bytes
  97         start      Pos       // start position of the item currently being scanned
  98         width      Pos       // width in bytes of the last rune read by next; zero at end of input
  99         lastPos    Pos       // position of most recent item returned by nextItem
 100         items      chan item // channel on which scanned items are delivered
 101         parenDepth int       // nesting depth of ( ) exprs
 102 }
 103
 104 // next returns the next rune in the input.
 105 func (l *lexer) next() rune {
 106         if int(l.pos) >= len(l.input) {
 107                 l.width = 0
 108                 return eof
 109         }
 110         r, w := utf8.DecodeRuneInString(l.input[l.pos:])
 111         l.width = Pos(w)
 112         l.pos += l.width
 113         return r
 114 }
 115
 116 // peek returns but does not consume the next rune in the input.
 117 func (l *lexer) peek() rune {
 118         r := l.next()
 119         l.backup()
 120         return r
 121 }
 122
 123 // backup steps back one rune. Can only be called once per call of next.
 124 func (l *lexer) backup() {
 125         l.pos -= l.width
 126 }
 127
 128 // emit passes an item back to the client.
 129 func (l *lexer) emit(t itemType) {
 130         l.items <- item{t, l.start, l.input[l.start:l.pos]}
 131         l.start = l.pos
 132 }
 133
 134 // ignore discards the pending input by advancing the start of the current item to the current position.
 135 func (l *lexer) ignore() {
 136         l.start = l.pos
 137 }
 138
 139 // accept consumes the next rune if it's from the valid set.
 140 func (l *lexer) accept(valid string) bool {
 141         if strings.IndexRune(valid, l.next()) >= 0 {
 142                 return true
 143         }
 144         l.backup()
 145         return false
 146 }
 147
 148 // acceptRun consumes a run of runes from the valid set.
 149 func (l *lexer) acceptRun(valid string) {
 150         for strings.IndexRune(valid, l.next()) >= 0 {
 151         }
 152         l.backup()
 153 }
 154
 155 // lineNumber reports which line we're on, based on the position of
 156 // the previous item returned by nextItem. Doing it this way
 157 // means we don't have to worry about peek double counting.
 158 func (l *lexer) lineNumber() int {
 159         return 1 + strings.Count(l.input[:l.lastPos], "\n")
 160 }
 161
 162 // errorf returns an error token and terminates the scan by passing
 163 // back a nil pointer that will be the next state, terminating l.nextItem.
 164 func (l *lexer) errorf(format string, args ...interface{}) stateFn {
 165         l.items <- item{itemError, l.start, fmt.Sprintf(format, args...)}
 166         return nil
 167 }
 168
 169 // nextItem returns the next item from the input.
 170 func (l *lexer) nextItem() item {
 171         item := <-l.items
 172         l.lastPos = item.pos
 173         return item
 174 }
 175
 176 // lex creates a new scanner for the input string.
 177 func lex(name, input, left, right string) *lexer {
 178         if left == "" {
 179                 left = leftDelim
 180         }
 181         if right == "" {
 182                 right = rightDelim
 183         }
 184         l := &lexer{
 185                 name:       name,
 186                 input:      input,
 187                 leftDelim:  left,
 188                 rightDelim: right,
 189                 items:      make(chan item),
 190         }
 191         go l.run()
 192         return l
 193 }
 194
 195 // run runs the state machine for the lexer.
 196 func (l *lexer) run() {
 197         for l.state = lexText; l.state != nil; {
 198                 l.state = l.state(l)
 199         }
 200 }
 201
 202 // state functions
 203
 204 const (
 205         leftDelim    = "{{" // default opening action delimiter, used by lex when none is given
 206         rightDelim   = "}}" // default closing action delimiter, used by lex when none is given
 207         leftComment  = "/*" // opens a comment; recognized directly after the left delimiter
 208         rightComment = "*/" // closes a comment; must be directly followed by the right delimiter
 209 )
 210
 211 // lexText scans until an opening action delimiter, "{{".
 212 func lexText(l *lexer) stateFn {
 213         for {
 214                 if strings.HasPrefix(l.input[l.pos:], l.leftDelim) {
 215                         if l.pos > l.start {
 216                                 l.emit(itemText)
 217                         }
 218                         return lexLeftDelim
 219                 }
 220                 if l.next() == eof {
 221                         break
 222                 }
 223         }
 224         // Correctly reached EOF.
 225         if l.pos > l.start {
 226                 l.emit(itemText)
 227         }
 228         l.emit(itemEOF)
 229         return nil
 230 }
 231
 232 // lexLeftDelim scans the left delimiter, which is known to be present.
 233 func lexLeftDelim(l *lexer) stateFn {
 234         l.pos += Pos(len(l.leftDelim))
 235         if strings.HasPrefix(l.input[l.pos:], leftComment) {
 236                 return lexComment
 237         }
 238         l.emit(itemLeftDelim)
 239         l.parenDepth = 0
 240         return lexInsideAction
 241 }
 242
 243 // lexComment scans a comment. The left comment marker is known to be present.
 244 func lexComment(l *lexer) stateFn {
 245         l.pos += Pos(len(leftComment))
 246         i := strings.Index(l.input[l.pos:], rightComment+l.rightDelim)
 247         if i < 0 {
 248                 return l.errorf("unclosed comment")
 249         }
 250         l.pos += Pos(i + len(rightComment) + len(l.rightDelim))
 251         l.ignore()
 252         return lexText
 253 }
 254
 255 // lexRightDelim scans the right delimiter, which is known to be present.
 256 func lexRightDelim(l *lexer) stateFn {
 257         l.pos += Pos(len(l.rightDelim))
 258         l.emit(itemRightDelim)
 259         return lexText
 260 }
 261
 262 // lexInsideAction scans the elements inside action delimiters.
 263 func lexInsideAction(l *lexer) stateFn {
 264         // Either number, quoted string, or identifier.
 265         // Spaces separate arguments; runs of spaces turn into itemSpace.
 266         // Pipe symbols separate and are emitted.
 267         if strings.HasPrefix(l.input[l.pos:], l.rightDelim) {
 268                 if l.parenDepth == 0 {
 269                         return lexRightDelim
 270                 }
 271                 return l.errorf("unclosed left paren")
 272         }
 273         switch r := l.next(); {
 274         case r == eof || isEndOfLine(r):
 275                 return l.errorf("unclosed action")
 276         case isSpace(r):
 277                 return lexSpace
 278         case r == ':':
 279                 if l.next() != '=' {
 280                         return l.errorf("expected :=")
 281                 }
 282                 l.emit(itemColonEquals)
 283         case r == '|':
 284                 l.emit(itemPipe)
 285         case r == '"':
 286                 return lexQuote
 287         case r == '`':
 288                 return lexRawQuote
 289         case r == '$':
 290                 return lexVariable
 291         case r == '\'':
 292                 return lexChar
 293         case r == '.':
 294                 // special look-ahead for ".field" so we don't break l.backup().
 295                 if l.pos < Pos(len(l.input)) {
 296                         r := l.input[l.pos]
 297                         if r < '0' || '9' < r {
 298                                 return lexField
 299                         }
 300                 }
 301                 fallthrough // '.' can start a number.
 302         case r == '+' || r == '-' || ('0' <= r && r <= '9'):
 303                 l.backup()
 304                 return lexNumber
 305         case isAlphaNumeric(r):
 306                 l.backup()
 307                 return lexIdentifier
 308         case r == '(':
 309                 l.emit(itemLeftParen)
 310                 l.parenDepth++
 311                 return lexInsideAction
 312         case r == ')':
 313                 l.emit(itemRightParen)
 314                 l.parenDepth--
 315                 if l.parenDepth < 0 {
 316                         return l.errorf("unexpected right paren %#U", r)
 317                 }
 318                 return lexInsideAction
 319         case r <= unicode.MaxASCII && unicode.IsPrint(r):
 320                 l.emit(itemChar)
 321                 return lexInsideAction
 322         default:
 323                 return l.errorf("unrecognized character in action: %#U", r)
 324         }
 325         return lexInsideAction
 326 }
 327
 328 // lexSpace scans a run of space characters.
 329 // One space has already been seen.
 330 func lexSpace(l *lexer) stateFn {
 331         for isSpace(l.peek()) {
 332                 l.next()
 333         }
 334         l.emit(itemSpace)
 335         return lexInsideAction
 336 }
 337
 338 // lexIdentifier scans an alphanumeric.
 339 func lexIdentifier(l *lexer) stateFn {
 340 Loop:
 341         for {
 342                 switch r := l.next(); {
 343                 case isAlphaNumeric(r):
 344                         // absorb.
 345                 default:
 346                         l.backup()
 347                         word := l.input[l.start:l.pos]
 348                         if !l.atTerminator() {
 349                                 return l.errorf("bad character %#U", r)
 350                         }
 351                         switch {
 352                         case key[word] > itemKeyword:
 353                                 l.emit(key[word])
 354                         case word[0] == '.':
 355                                 l.emit(itemField)
 356                         case word == "true", word == "false":
 357                                 l.emit(itemBool)
 358                         default:
 359                                 l.emit(itemIdentifier)
 360                         }
 361                         break Loop
 362                 }
 363         }
 364         return lexInsideAction
 365 }
 366
 367 // lexField scans a field such as .Name, or a lone '.' (emitted as itemDot).
 368 // The leading '.' has already been scanned.
 369 func lexField(l *lexer) stateFn {
 370         return lexFieldOrVariable(l, itemField)
 371 }
 372
 373 // lexVariable scans a Variable: $Alphanumeric.
 374 // The $ has been scanned.
 375 func lexVariable(l *lexer) stateFn {
 376         if l.atTerminator() { // Nothing interesting follows -> "$".
 377                 l.emit(itemVariable)
 378                 return lexInsideAction
 379         }
 380         return lexFieldOrVariable(l, itemVariable)
 381 }
 382
 383 // lexVariable scans a field or variable: [.$]Alphanumeric.
 384 // The . or $ has been scanned.
 385 func lexFieldOrVariable(l *lexer, typ itemType) stateFn {
 386         if l.atTerminator() { // Nothing interesting follows -> "." or "$".
 387                 if typ == itemVariable {
 388                         l.emit(itemVariable)
 389                 } else {
 390                         l.emit(itemDot)
 391                 }
 392                 return lexInsideAction
 393         }
 394         var r rune
 395         for {
 396                 r = l.next()
 397                 if !isAlphaNumeric(r) {
 398                         l.backup()
 399                         break
 400                 }
 401         }
 402         if !l.atTerminator() {
 403                 return l.errorf("bad character %#U", r)
 404         }
 405         l.emit(typ)
 406         return lexInsideAction
 407 }
 408
 409 // atTerminator reports whether the input is at valid termination character to
 410 // appear after an identifier. Breaks .X.Y into two pieces. Also catches cases
 411 // like "$x+2" not being acceptable without a space, in case we decide one
 412 // day to implement arithmetic.
 413 func (l *lexer) atTerminator() bool {
 414         r := l.peek()
 415         if isSpace(r) || isEndOfLine(r) {
 416                 return true
 417         }
 418         switch r {
 419         case eof, '.', ',', '|', ':', ')', '(':
 420                 return true
 421         }
 422         // Does r start the delimiter? This can be ambiguous (with delim=="//", $x/2 will
 423         // succeed but should fail) but only in extremely rare cases caused by willfully
 424         // bad choice of delimiter.
 425         if rd, _ := utf8.DecodeRuneInString(l.rightDelim); rd == r {
 426                 return true
 427         }
 428         return false
 429 }
 430
 431 // lexChar scans a character constant. The initial quote is already
 432 // scanned. Syntax checking is done by the parser.
 433 func lexChar(l *lexer) stateFn {
 434 Loop:
 435         for {
 436                 switch l.next() {
 437                 case '\\':
 438                         if r := l.next(); r != eof && r != '\n' {
 439                                 break
 440                         }
 441                         fallthrough
 442                 case eof, '\n':
 443                         return l.errorf("unterminated character constant")
 444                 case '\'':
 445                         break Loop
 446                 }
 447         }
 448         l.emit(itemCharConstant)
 449         return lexInsideAction
 450 }
 451
 452 // lexNumber scans a number: decimal, octal, hex, float, or imaginary. This
 453 // isn't a perfect number scanner - for instance it accepts "." and "0x0.2"
 454 // and "089" - but when it's wrong the input is invalid and the parser (via
 455 // strconv) will notice.
 456 func lexNumber(l *lexer) stateFn {
 457         if !l.scanNumber() {
 458                 return l.errorf("bad number syntax: %q", l.input[l.start:l.pos])
 459         }
 460         if sign := l.peek(); sign == '+' || sign == '-' {
 461                 // Complex: 1+2i. No spaces, must end in 'i'.
 462                 if !l.scanNumber() || l.input[l.pos-1] != 'i' {
 463                         return l.errorf("bad number syntax: %q", l.input[l.start:l.pos])
 464                 }
 465                 l.emit(itemComplex)
 466         } else {
 467                 l.emit(itemNumber)
 468         }
 469         return lexInsideAction
 470 }
 471
 472 func (l *lexer) scanNumber() bool {
 473         // Optional leading sign.
 474         l.accept("+-")
 475         // Is it hex?
 476         digits := "0123456789"
 477         if l.accept("0") && l.accept("xX") {
 478                 digits = "0123456789abcdefABCDEF"
 479         }
 480         l.acceptRun(digits)
 481         if l.accept(".") {
 482                 l.acceptRun(digits)
 483         }
 484         if l.accept("eE") {
 485                 l.accept("+-")
 486                 l.acceptRun("0123456789")
 487         }
 488         // Is it imaginary?
 489         l.accept("i")
 490         // Next thing mustn't be alphanumeric.
 491         if isAlphaNumeric(l.peek()) {
 492                 l.next()
 493                 return false
 494         }
 495         return true
 496 }
 497
 498 // lexQuote scans a quoted string.
 499 func lexQuote(l *lexer) stateFn {
 500 Loop:
 501         for {
 502                 switch l.next() {
 503                 case '\\':
 504                         if r := l.next(); r != eof && r != '\n' {
 505                                 break
 506                         }
 507                         fallthrough
 508                 case eof, '\n':
 509                         return l.errorf("unterminated quoted string")
 510                 case '"':
 511                         break Loop
 512                 }
 513         }
 514         l.emit(itemString)
 515         return lexInsideAction
 516 }
 517
 518 // lexRawQuote scans a raw quoted string.
 519 func lexRawQuote(l *lexer) stateFn {
 520 Loop:
 521         for {
 522                 switch l.next() {
 523                 case eof, '\n':
 524                         return l.errorf("unterminated raw quoted string")
 525                 case '`':
 526                         break Loop
 527                 }
 528         }
 529         l.emit(itemRawString)
 530         return lexInsideAction
 531 }
 532
 533 // isSpace reports whether r is a space character.
 534 func isSpace(r rune) bool {
 535         return r == ' ' || r == '\t'
 536 }
 537
 538 // isEndOfLine reports whether r is an end-of-line character.
 539 func isEndOfLine(r rune) bool {
 540         return r == '\r' || r == '\n'
 541 }
 542
 543 // isAlphaNumeric reports whether r is an alphabetic, digit, or underscore.
 544 func isAlphaNumeric(r rune) bool {
 545         return r == '_' || unicode.IsLetter(r) || unicode.IsDigit(r)
 546 }