libgo/go/json/scanner.go

   1 // Copyright 2010 The Go Authors.  All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 package json
   6
   7 // JSON value parser state machine.
   8 // Just about at the limit of what is reasonable to write by hand.
   9 // Some parts are a bit tedious, but overall it nicely factors out the
  10 // otherwise common code from the multiple scanning functions
  11 // in this package (Compact, Indent, checkValid, nextValue, etc).
  12 //
  13 // This file starts with two simple examples using the scanner
  14 // before diving into the scanner itself.
  15
  16 import (
  17         "os"
  18         "strconv"
  19 )
  20
  21 // checkValid verifies that data is valid JSON-encoded data.
  22 // scan is passed in for use by checkValid to avoid an allocation.
  23 func checkValid(data []byte, scan *scanner) os.Error {
  24         scan.reset()
  25         for _, c := range data {
  26                 if scan.step(scan, int(c)) == scanError {
  27                         return scan.err
  28                 }
  29         }
  30         if scan.eof() == scanError {
  31                 return scan.err
  32         }
  33         return nil
  34 }
  35
  36 // nextValue splits data after the next whole JSON value,
  37 // returning that value and the bytes that follow it as separate slices.
  38 // scan is passed in for use by nextValue to avoid an allocation.
  39 func nextValue(data []byte, scan *scanner) (value, rest []byte, err os.Error) {
  40         scan.reset()
  41         for i, c := range data {
  42                 v := scan.step(scan, int(c))
  43                 if v >= scanEnd {
  44                         switch v {
  45                         case scanError:
  46                                 return nil, nil, scan.err
  47                         case scanEnd:
  48                                 return data[0:i], data[i:], nil
  49                         }
  50                 }
  51         }
  52         if scan.eof() == scanError {
  53                 return nil, nil, scan.err
  54         }
  55         return data, nil, nil
  56 }
  57
  58 // A SyntaxError is a description of a JSON syntax error.
  59 type SyntaxError string
  60
  61 func (e SyntaxError) String() string { return string(e) }
  62
  63
// A scanner is a JSON scanning state machine.
// Callers call scan.reset() and then pass bytes in one at a time
// by calling scan.step(scan, c) for each byte (scan being a *scanner).
// The return value, referred to as an opcode, tells the
// caller about significant parsing events like beginning
// and ending literals, objects, and arrays, so that the
// caller can follow along if it wishes.
// The return value scanEnd indicates that a single top-level
// JSON value has been completed, *before* the byte that
// just got passed in.  (The indication must be delayed in order
// to recognize the end of numbers: is 123 a whole value or
// the beginning of 12345e+6?).
type scanner struct {
	// The step is a func to be called to execute the next transition.
	// Also tried using an integer constant and a single func
	// with a switch, but using the func directly was 10% faster
	// on a 64-bit Mac Mini, and it's nicer to read.
	step func(*scanner, int) int

	// Stack of what we're in the middle of - array values, object keys, object values.
	// Entries are the parseObjectKey / parseObjectValue / parseArrayValue
	// constants; the innermost composite is the last element.
	parseState []int

	// Error that happened, if any.
	// Set by the error method and by eof; cleared by reset.
	err os.Error

	// 1-byte redo (see undo method).
	// undo saves the opcode to replay in redoCode and the interrupted
	// state in redoState; stateRedo consumes both on the next step.
	redoCode  int
	redoState func(*scanner, int) int
}
  93
  94 // These values are returned by the state transition functions
  95 // assigned to scanner.state and the method scanner.eof.
  96 // They give details about the current state of the scan that
  97 // callers might be interested to know about.
  98 // It is okay to ignore the return value of any particular
  99 // call to scanner.state: if one call returns scanError,
 100 // every subsequent call will return scanError too.
 101 const (
 102         // Continue.
 103         scanContinue     = iota // uninteresting byte
 104         scanBeginLiteral        // end implied by next result != scanContinue
 105         scanBeginObject         // begin object
 106         scanObjectKey           // just finished object key (string)
 107         scanObjectValue         // just finished non-last object value
 108         scanEndObject           // end object (implies scanObjectValue if possible)
 109         scanBeginArray          // begin array
 110         scanArrayValue          // just finished array value
 111         scanEndArray            // end array (implies scanArrayValue if possible)
 112         scanSkipSpace           // space byte; can skip; known to be last "continue" result
 113
 114         // Stop.
 115         scanEnd   // top-level value ended *before* this byte; known to be first "stop" result
 116         scanError // hit an error, scanner.err.
 117 )
 118
 119 // These values are stored in the parseState stack.
 120 // They give the current state of a composite value
 121 // being scanned.  If the parser is inside a nested value
 122 // the parseState describes the nested state, outermost at entry 0.
 123 const (
 124         parseObjectKey   = iota // parsing object key (before colon)
 125         parseObjectValue        // parsing object value (after colon)
 126         parseArrayValue         // parsing array value
 127 )
 128
 129 // reset prepares the scanner for use.
 130 // It must be called before calling s.step.
 131 func (s *scanner) reset() {
 132         s.step = stateBeginValue
 133         s.parseState = s.parseState[0:0]
 134         s.err = nil
 135 }
 136
 137 // eof tells the scanner that the end of input has been reached.
 138 // It returns a scan status just as s.step does.
 139 func (s *scanner) eof() int {
 140         if s.err != nil {
 141                 return scanError
 142         }
 143         if s.step == stateEndTop {
 144                 return scanEnd
 145         }
 146         s.step(s, ' ')
 147         if s.step == stateEndTop {
 148                 return scanEnd
 149         }
 150         if s.err == nil {
 151                 s.err = SyntaxError("unexpected end of JSON input")
 152         }
 153         return scanError
 154 }
 155
 156 // pushParseState pushes a new parse state p onto the parse stack.
 157 func (s *scanner) pushParseState(p int) {
 158         s.parseState = append(s.parseState, p)
 159 }
 160
 161 // popParseState pops a parse state (already obtained) off the stack
 162 // and updates s.step accordingly.
 163 func (s *scanner) popParseState() {
 164         n := len(s.parseState) - 1
 165         s.parseState = s.parseState[0:n]
 166         if n == 0 {
 167                 s.step = stateEndTop
 168         } else {
 169                 s.step = stateEndValue
 170         }
 171 }
 172
 173 func isSpace(c int) bool {
 174         return c == ' ' || c == '\t' || c == '\r' || c == '\n'
 175 }
 176
 177 // NOTE(rsc): The various instances of
 178 //
 179 //      if c <= ' ' && (c == ' ' || c == '\t' || c == '\r' || c == '\n')
 180 //
 181 // below should all be if c <= ' ' && isSpace(c), but inlining
 182 // the checks makes a significant difference (>10%) in tight loops
 183 // such as nextValue.  These should be rewritten with the clearer
 184 // function call once 6g knows to inline the call.
 185
 186 // stateBeginValueOrEmpty is the state after reading `[`.
 187 func stateBeginValueOrEmpty(s *scanner, c int) int {
 188         if c <= ' ' && (c == ' ' || c == '\t' || c == '\r' || c == '\n') {
 189                 return scanSkipSpace
 190         }
 191         if c == ']' {
 192                 return stateEndValue(s, c)
 193         }
 194         return stateBeginValue(s, c)
 195 }
 196
 197 // stateBeginValue is the state at the beginning of the input.
 198 func stateBeginValue(s *scanner, c int) int {
 199         if c <= ' ' && (c == ' ' || c == '\t' || c == '\r' || c == '\n') {
 200                 return scanSkipSpace
 201         }
 202         switch c {
 203         case '{':
 204                 s.step = stateBeginStringOrEmpty
 205                 s.pushParseState(parseObjectKey)
 206                 return scanBeginObject
 207         case '[':
 208                 s.step = stateBeginValueOrEmpty
 209                 s.pushParseState(parseArrayValue)
 210                 return scanBeginArray
 211         case '"':
 212                 s.step = stateInString
 213                 return scanBeginLiteral
 214         case '-':
 215                 s.step = stateNeg
 216                 return scanBeginLiteral
 217         case '0': // beginning of 0.123
 218                 s.step = state0
 219                 return scanBeginLiteral
 220         case 't': // beginning of true
 221                 s.step = stateT
 222                 return scanBeginLiteral
 223         case 'f': // beginning of false
 224                 s.step = stateF
 225                 return scanBeginLiteral
 226         case 'n': // beginning of null
 227                 s.step = stateN
 228                 return scanBeginLiteral
 229         }
 230         if '1' <= c && c <= '9' { // beginning of 1234.5
 231                 s.step = state1
 232                 return scanBeginLiteral
 233         }
 234         return s.error(c, "looking for beginning of value")
 235 }
 236
 237 // stateBeginStringOrEmpty is the state after reading `{`.
 238 func stateBeginStringOrEmpty(s *scanner, c int) int {
 239         if c <= ' ' && (c == ' ' || c == '\t' || c == '\r' || c == '\n') {
 240                 return scanSkipSpace
 241         }
 242         if c == '}' {
 243                 n := len(s.parseState)
 244                 s.parseState[n-1] = parseObjectValue
 245                 return stateEndValue(s, c)
 246         }
 247         return stateBeginString(s, c)
 248 }
 249
 250 // stateBeginString is the state after reading `{"key": value,`.
 251 func stateBeginString(s *scanner, c int) int {
 252         if c <= ' ' && (c == ' ' || c == '\t' || c == '\r' || c == '\n') {
 253                 return scanSkipSpace
 254         }
 255         if c == '"' {
 256                 s.step = stateInString
 257                 return scanBeginLiteral
 258         }
 259         return s.error(c, "looking for beginning of object key string")
 260 }
 261
 262 // stateEndValue is the state after completing a value,
 263 // such as after reading `{}` or `true` or `["x"`.
 264 func stateEndValue(s *scanner, c int) int {
 265         n := len(s.parseState)
 266         if n == 0 {
 267                 // Completed top-level before the current byte.
 268                 s.step = stateEndTop
 269                 return stateEndTop(s, c)
 270         }
 271         if c <= ' ' && (c == ' ' || c == '\t' || c == '\r' || c == '\n') {
 272                 s.step = stateEndValue
 273                 return scanSkipSpace
 274         }
 275         ps := s.parseState[n-1]
 276         switch ps {
 277         case parseObjectKey:
 278                 if c == ':' {
 279                         s.parseState[n-1] = parseObjectValue
 280                         s.step = stateBeginValue
 281                         return scanObjectKey
 282                 }
 283                 return s.error(c, "after object key")
 284         case parseObjectValue:
 285                 if c == ',' {
 286                         s.parseState[n-1] = parseObjectKey
 287                         s.step = stateBeginString
 288                         return scanObjectValue
 289                 }
 290                 if c == '}' {
 291                         s.popParseState()
 292                         return scanEndObject
 293                 }
 294                 return s.error(c, "after object key:value pair")
 295         case parseArrayValue:
 296                 if c == ',' {
 297                         s.step = stateBeginValue
 298                         return scanArrayValue
 299                 }
 300                 if c == ']' {
 301                         s.popParseState()
 302                         return scanEndArray
 303                 }
 304                 return s.error(c, "after array element")
 305         }
 306         return s.error(c, "")
 307 }
 308
 309 // stateEndTop is the state after finishing the top-level value,
 310 // such as after reading `{}` or `[1,2,3]`.
 311 // Only space characters should be seen now.
 312 func stateEndTop(s *scanner, c int) int {
 313         if c != ' ' && c != '\t' && c != '\r' && c != '\n' {
 314                 // Complain about non-space byte on next call.
 315                 s.error(c, "after top-level value")
 316         }
 317         return scanEnd
 318 }
 319
 320 // stateInString is the state after reading `"`.
 321 func stateInString(s *scanner, c int) int {
 322         if c == '"' {
 323                 s.step = stateEndValue
 324                 return scanContinue
 325         }
 326         if c == '\\' {
 327                 s.step = stateInStringEsc
 328                 return scanContinue
 329         }
 330         if c < 0x20 {
 331                 return s.error(c, "in string literal")
 332         }
 333         return scanContinue
 334 }
 335
 336 // stateInStringEsc is the state after reading `"\` during a quoted string.
 337 func stateInStringEsc(s *scanner, c int) int {
 338         switch c {
 339         case 'b', 'f', 'n', 'r', 't', '\\', '/', '"':
 340                 s.step = stateInString
 341                 return scanContinue
 342         }
 343         if c == 'u' {
 344                 s.step = stateInStringEscU
 345                 return scanContinue
 346         }
 347         return s.error(c, "in string escape code")
 348 }
 349
 350 // stateInStringEscU is the state after reading `"\u` during a quoted string.
 351 func stateInStringEscU(s *scanner, c int) int {
 352         if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
 353                 s.step = stateInStringEscU1
 354                 return scanContinue
 355         }
 356         // numbers
 357         return s.error(c, "in \\u hexadecimal character escape")
 358 }
 359
 360 // stateInStringEscU1 is the state after reading `"\u1` during a quoted string.
 361 func stateInStringEscU1(s *scanner, c int) int {
 362         if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
 363                 s.step = stateInStringEscU12
 364                 return scanContinue
 365         }
 366         // numbers
 367         return s.error(c, "in \\u hexadecimal character escape")
 368 }
 369
 370 // stateInStringEscU12 is the state after reading `"\u12` during a quoted string.
 371 func stateInStringEscU12(s *scanner, c int) int {
 372         if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
 373                 s.step = stateInStringEscU123
 374                 return scanContinue
 375         }
 376         // numbers
 377         return s.error(c, "in \\u hexadecimal character escape")
 378 }
 379
 380 // stateInStringEscU123 is the state after reading `"\u123` during a quoted string.
 381 func stateInStringEscU123(s *scanner, c int) int {
 382         if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
 383                 s.step = stateInString
 384                 return scanContinue
 385         }
 386         // numbers
 387         return s.error(c, "in \\u hexadecimal character escape")
 388 }
 389
 390 // stateInStringEscU123 is the state after reading `-` during a number.
 391 func stateNeg(s *scanner, c int) int {
 392         if c == '0' {
 393                 s.step = state0
 394                 return scanContinue
 395         }
 396         if '1' <= c && c <= '9' {
 397                 s.step = state1
 398                 return scanContinue
 399         }
 400         return s.error(c, "in numeric literal")
 401 }
 402
 403 // state1 is the state after reading a non-zero integer during a number,
 404 // such as after reading `1` or `100` but not `0`.
 405 func state1(s *scanner, c int) int {
 406         if '0' <= c && c <= '9' {
 407                 s.step = state1
 408                 return scanContinue
 409         }
 410         return state0(s, c)
 411 }
 412
 413 // state0 is the state after reading `0` during a number.
 414 func state0(s *scanner, c int) int {
 415         if c == '.' {
 416                 s.step = stateDot
 417                 return scanContinue
 418         }
 419         if c == 'e' {
 420                 s.step = stateE
 421                 return scanContinue
 422         }
 423         return stateEndValue(s, c)
 424 }
 425
 426 // stateDot is the state after reading the integer and decimal point in a number,
 427 // such as after reading `1.`.
 428 func stateDot(s *scanner, c int) int {
 429         if '0' <= c && c <= '9' {
 430                 s.step = stateDot0
 431                 return scanContinue
 432         }
 433         return s.error(c, "after decimal point in numeric literal")
 434 }
 435
 436 // stateDot0 is the state after reading the integer, decimal point, and subsequent
 437 // digits of a number, such as after reading `3.14`.
 438 func stateDot0(s *scanner, c int) int {
 439         if '0' <= c && c <= '9' {
 440                 s.step = stateDot0
 441                 return scanContinue
 442         }
 443         if c == 'e' {
 444                 s.step = stateE
 445                 return scanContinue
 446         }
 447         return stateEndValue(s, c)
 448 }
 449
 450 // stateE is the state after reading the mantissa and e in a number,
 451 // such as after reading `314e` or `0.314e`.
 452 func stateE(s *scanner, c int) int {
 453         if c == '+' {
 454                 s.step = stateESign
 455                 return scanContinue
 456         }
 457         if c == '-' {
 458                 s.step = stateESign
 459                 return scanContinue
 460         }
 461         return stateESign(s, c)
 462 }
 463
 464 // stateESign is the state after reading the mantissa, e, and sign in a number,
 465 // such as after reading `314e-` or `0.314e+`.
 466 func stateESign(s *scanner, c int) int {
 467         if '0' <= c && c <= '9' {
 468                 s.step = stateE0
 469                 return scanContinue
 470         }
 471         return s.error(c, "in exponent of numeric literal")
 472 }
 473
 474 // stateE0 is the state after reading the mantissa, e, optional sign,
 475 // and at least one digit of the exponent in a number,
 476 // such as after reading `314e-2` or `0.314e+1` or `3.14e0`.
 477 func stateE0(s *scanner, c int) int {
 478         if '0' <= c && c <= '9' {
 479                 s.step = stateE0
 480                 return scanContinue
 481         }
 482         return stateEndValue(s, c)
 483 }
 484
 485 // stateT is the state after reading `t`.
 486 func stateT(s *scanner, c int) int {
 487         if c == 'r' {
 488                 s.step = stateTr
 489                 return scanContinue
 490         }
 491         return s.error(c, "in literal true (expecting 'r')")
 492 }
 493
 494 // stateTr is the state after reading `tr`.
 495 func stateTr(s *scanner, c int) int {
 496         if c == 'u' {
 497                 s.step = stateTru
 498                 return scanContinue
 499         }
 500         return s.error(c, "in literal true (expecting 'u')")
 501 }
 502
 503 // stateTru is the state after reading `tru`.
 504 func stateTru(s *scanner, c int) int {
 505         if c == 'e' {
 506                 s.step = stateEndValue
 507                 return scanContinue
 508         }
 509         return s.error(c, "in literal true (expecting 'e')")
 510 }
 511
 512 // stateF is the state after reading `f`.
 513 func stateF(s *scanner, c int) int {
 514         if c == 'a' {
 515                 s.step = stateFa
 516                 return scanContinue
 517         }
 518         return s.error(c, "in literal false (expecting 'a')")
 519 }
 520
 521 // stateFa is the state after reading `fa`.
 522 func stateFa(s *scanner, c int) int {
 523         if c == 'l' {
 524                 s.step = stateFal
 525                 return scanContinue
 526         }
 527         return s.error(c, "in literal false (expecting 'l')")
 528 }
 529
 530 // stateFal is the state after reading `fal`.
 531 func stateFal(s *scanner, c int) int {
 532         if c == 's' {
 533                 s.step = stateFals
 534                 return scanContinue
 535         }
 536         return s.error(c, "in literal false (expecting 's')")
 537 }
 538
 539 // stateFals is the state after reading `fals`.
 540 func stateFals(s *scanner, c int) int {
 541         if c == 'e' {
 542                 s.step = stateEndValue
 543                 return scanContinue
 544         }
 545         return s.error(c, "in literal false (expecting 'e')")
 546 }
 547
 548 // stateN is the state after reading `n`.
 549 func stateN(s *scanner, c int) int {
 550         if c == 'u' {
 551                 s.step = stateNu
 552                 return scanContinue
 553         }
 554         return s.error(c, "in literal null (expecting 'u')")
 555 }
 556
 557 // stateNu is the state after reading `nu`.
 558 func stateNu(s *scanner, c int) int {
 559         if c == 'l' {
 560                 s.step = stateNul
 561                 return scanContinue
 562         }
 563         return s.error(c, "in literal null (expecting 'l')")
 564 }
 565
 566 // stateNul is the state after reading `nul`.
 567 func stateNul(s *scanner, c int) int {
 568         if c == 'l' {
 569                 s.step = stateEndValue
 570                 return scanContinue
 571         }
 572         return s.error(c, "in literal null (expecting 'l')")
 573 }
 574
// stateError is the state after reaching a syntax error,
// such as after reading `[1}` or `5.1.2`.
// It ignores its input and never changes s.step, so once the scanner
// is here every further step returns scanError.
func stateError(s *scanner, c int) int {
	return scanError
}
 580
 581 // error records an error and switches to the error state.
 582 func (s *scanner) error(c int, context string) int {
 583         s.step = stateError
 584         s.err = SyntaxError("invalid character " + quoteChar(c) + " " + context)
 585         return scanError
 586 }
 587
 588 // quoteChar formats c as a quoted character literal
 589 func quoteChar(c int) string {
 590         // special cases - different from quoted strings
 591         if c == '\'' {
 592                 return `'\''`
 593         }
 594         if c == '"' {
 595                 return `'"'`
 596         }
 597
 598         // use quoted string with different quotation marks
 599         s := strconv.Quote(string(c))
 600         return "'" + s[1:len(s)-1] + "'"
 601 }
 602
 603 // undo causes the scanner to return scanCode from the next state transition.
 604 // This gives callers a simple 1-byte undo mechanism.
 605 func (s *scanner) undo(scanCode int) {
 606         if s.step == stateRedo {
 607                 panic("invalid use of scanner")
 608         }
 609         s.redoCode = scanCode
 610         s.redoState = s.step
 611         s.step = stateRedo
 612 }
 613
 614 // stateRedo helps implement the scanner's 1-byte undo.
 615 func stateRedo(s *scanner, c int) int {
 616         s.step = s.redoState
 617         return s.redoCode
 618 }