1 # Protocol Buffers - Google's data interchange format
2 # Copyright 2008 Google Inc. All rights reserved.
3 # https://developers.google.com/protocol-buffers/
5 # Redistribution and use in source and binary forms, with or without
6 # modification, are permitted provided that the following conditions are
9 # * Redistributions of source code must retain the above copyright
10 # notice, this list of conditions and the following disclaimer.
11 # * Redistributions in binary form must reproduce the above
12 # copyright notice, this list of conditions and the following disclaimer
13 # in the documentation and/or other materials provided with the
15 # * Neither the name of Google Inc. nor the names of its
16 # contributors may be used to endorse or promote products derived from
17 # this software without specific prior written permission.
19 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 #PY25 compatible for GAE.
33 # Copyright 2007 Google Inc. All Rights Reserved.
35 """Contains routines for printing protocol messages in text format."""
37 __author__ = 'kenton@google.com (Kenton Varda)'
import cStringIO
import re

from google.protobuf.internal import type_checkers
from google.protobuf import descriptor
from google.protobuf import text_encoding
# Names exported by "from text_format import *".
__all__ = ['MessageToString', 'PrintMessage', 'PrintField',
           'PrintFieldValue', 'Merge']

# Range checkers for the four integer kinds, indexed by
# 2 * int(is_long) + int(is_signed) — see ParseInteger.
_INTEGER_CHECKERS = (type_checkers.Uint32ValueChecker(),
                     type_checkers.Int32ValueChecker(),
                     type_checkers.Uint64ValueChecker(),
                     type_checkers.Int64ValueChecker())
# Alternative textual spellings of infinity and NaN accepted when parsing
# floats (case-insensitive, optional trailing 'f'), e.g. "inf", "-Infinityf".
_FLOAT_INFINITY = re.compile('-?inf(?:inity)?f?', re.IGNORECASE)
_FLOAT_NAN = re.compile('nanf?', re.IGNORECASE)
# C++ field types holding floating point values; used by PrintFieldValue to
# decide when the float_format option applies.
_FLOAT_TYPES = frozenset([descriptor.FieldDescriptor.CPPTYPE_FLOAT,
                          descriptor.FieldDescriptor.CPPTYPE_DOUBLE])
class Error(Exception):
  """Base class for all errors raised by the text_format module."""
class ParseError(Error):
  """Raised when the ASCII representation of a message cannot be parsed."""
def MessageToString(message, as_utf8=False, as_one_line=False,
                    pointy_brackets=False, use_index_order=False,
                    float_format=None):
  """Convert protobuf message to text format.

  Floating point values can be formatted compactly with 15 digits of
  precision (which is the most that IEEE 754 "double" can guarantee)
  using float_format='.15g'.

  Args:
    message: The protocol buffers message.
    as_utf8: Produce text output in UTF8 format.
    as_one_line: Don't introduce newlines between fields.
    pointy_brackets: If True, use angle brackets instead of curly braces for
      nesting.
    use_index_order: If True, print fields of a proto message using the order
      defined in source code instead of the field number. By default, use the
      field number order.
    float_format: If set, use this to specify floating point number formatting
      (per the "Format Specification Mini-Language"); otherwise, str() is used.

  Returns:
    A string of the text formatted protocol buffer message.
  """
  out = cStringIO.StringIO()
  PrintMessage(message, out, as_utf8=as_utf8, as_one_line=as_one_line,
               pointy_brackets=pointy_brackets,
               use_index_order=use_index_order,
               float_format=float_format)
  result = out.getvalue()
  if as_one_line:
    # Single-line output ends with a trailing field separator; drop it.
    return result.rstrip()
  return result
def PrintMessage(message, out, indent=0, as_utf8=False, as_one_line=False,
                 pointy_brackets=False, use_index_order=False,
                 float_format=None):
  """Print a text representation of `message` to the file-like object `out`.

  Args:
    message: The protocol buffers message to print.
    out: A file-like object to write the text representation to.
    indent: Number of spaces each line is indented by (for nested messages).
    as_utf8: Produce text output in UTF8 format.
    as_one_line: Don't introduce newlines between fields.
    pointy_brackets: If True, use angle brackets instead of curly braces.
    use_index_order: If True, sort fields by their source-code declaration
      index instead of by field number.
    float_format: Optional format spec for floating point values.
  """
  fields = message.ListFields()
  if use_index_order:
    # FieldDescriptor.index is the declaration order in the .proto file.
    fields.sort(key=lambda x: x[0].index)
  for field, value in fields:
    if field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
      # Repeated fields are printed as one name/value pair per element.
      for element in value:
        PrintField(field, element, out, indent, as_utf8, as_one_line,
                   pointy_brackets=pointy_brackets,
                   float_format=float_format)
    else:
      PrintField(field, value, out, indent, as_utf8, as_one_line,
                 pointy_brackets=pointy_brackets,
                 float_format=float_format)
def PrintField(field, value, out, indent=0, as_utf8=False, as_one_line=False,
               pointy_brackets=False, float_format=None):
  """Print a single field name/value pair.  For repeated fields, the value
  should be a single element."""

  out.write(' ' * indent)
  if field.is_extension:
    out.write('[')
    # MessageSet extensions are printed using the name of the contained
    # message type rather than the extension's full name.
    if (field.containing_type.GetOptions().message_set_wire_format and
        field.type == descriptor.FieldDescriptor.TYPE_MESSAGE and
        field.message_type == field.extension_scope and
        field.label == descriptor.FieldDescriptor.LABEL_OPTIONAL):
      out.write(field.message_type.full_name)
    else:
      out.write(field.full_name)
    out.write(']')
  elif field.type == descriptor.FieldDescriptor.TYPE_GROUP:
    # For groups, use the capitalized name.
    out.write(field.message_type.name)
  else:
    out.write(field.name)

  if field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
    # The colon is optional in this case, but our cross-language golden files
    # don't include it.
    out.write(': ')

  PrintFieldValue(field, value, out, indent, as_utf8, as_one_line,
                  pointy_brackets=pointy_brackets,
                  float_format=float_format)
  if as_one_line:
    out.write(' ')
  else:
    out.write('\n')
def PrintFieldValue(field, value, out, indent=0, as_utf8=False,
                    as_one_line=False, pointy_brackets=False,
                    float_format=None):
  """Print a single field value (not including name).  For repeated fields,
  the value should be a single element.

  Args:
    field: The descriptor of the field being printed.
    value: The value of the field (a single element for repeated fields).
    out: A file-like object the text is written to.
    indent: Indentation (in spaces) for nested messages.
    as_utf8: Write string fields as UTF8 instead of escaping them.
    as_one_line: Print nested messages without newlines.
    pointy_brackets: Use '<' '>' instead of '{' '}' around nested messages.
    float_format: Optional format spec applied to float/double values.
  """
  if pointy_brackets:
    openb = '<'
    closeb = '>'
  else:
    openb = '{'
    closeb = '}'

  if field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
    if as_one_line:
      out.write(' %s ' % openb)
      PrintMessage(value, out, indent, as_utf8, as_one_line,
                   pointy_brackets=pointy_brackets,
                   float_format=float_format)
      out.write(closeb)
    else:
      out.write(' %s\n' % openb)
      PrintMessage(value, out, indent + 2, as_utf8, as_one_line,
                   pointy_brackets=pointy_brackets,
                   float_format=float_format)
      out.write(' ' * indent + closeb)
  elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_ENUM:
    enum_value = field.enum_type.values_by_number.get(value, None)
    if enum_value is not None:
      out.write(enum_value.name)
    else:
      # Unknown enum number: fall back to printing the raw value.
      out.write(str(value))
  elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_STRING:
    out.write('\"')
    if isinstance(value, unicode):
      out_value = value.encode('utf-8')
    else:
      out_value = value
    if field.type == descriptor.FieldDescriptor.TYPE_BYTES:
      # We need to escape non-UTF8 chars in TYPE_BYTES field.
      out_as_utf8 = False
    else:
      out_as_utf8 = as_utf8
    out.write(text_encoding.CEscape(out_value, out_as_utf8))
    out.write('\"')
  elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_BOOL:
    if value:
      out.write('true')
    else:
      out.write('false')
  elif field.cpp_type in _FLOAT_TYPES and float_format is not None:
    out.write('{1:{0}}'.format(float_format, value))
  else:
    out.write(str(value))
def _ParseOrMerge(lines, message, allow_multiple_scalars):
  """Converts an ASCII representation of a protocol message into a message.

  Args:
    lines: Lines of a message's ASCII representation.
    message: A protocol buffer message to merge into.
    allow_multiple_scalars: Determines if repeated values for a non-repeated
      field are permitted, e.g., the string "foo: 1 foo: 2" for a
      required/optional field named "foo".

  Raises:
    ParseError: On ASCII parsing problems.
  """
  tok = _Tokenizer(lines)
  # Merge top-level fields one at a time until the input is exhausted.
  while not tok.AtEnd():
    _MergeField(tok, message, allow_multiple_scalars)
def Parse(text, message):
  """Parses an ASCII representation of a protocol message into a message.

  Args:
    text: Message ASCII representation.
    message: A protocol buffer message to merge into.

  Returns:
    The same message passed as argument.

  Raises:
    ParseError: On ASCII parsing problems.
  """
  if not isinstance(text, str):
    # Accept byte input by decoding it as UTF-8 first.
    text = text.decode('utf-8')
  return ParseLines(text.split('\n'), message)
def Merge(text, message):
  """Parses an ASCII representation of a protocol message into a message.

  Like Parse(), but allows repeated values for a non-repeated field, and uses
  the last one seen.

  Args:
    text: Message ASCII representation.
    message: A protocol buffer message to merge into.

  Returns:
    The same message passed as argument.

  Raises:
    ParseError: On ASCII parsing problems.
  """
  lines = text.split('\n')
  return MergeLines(lines, message)
def ParseLines(lines, message):
  """Parses an ASCII representation of a protocol message into a message.

  Args:
    lines: An iterable of lines of a message's ASCII representation.
    message: A protocol buffer message to merge into.

  Returns:
    The same message passed as argument.

  Raises:
    ParseError: On ASCII parsing problems.
  """
  _ParseOrMerge(lines, message, False)
  # Return the message so calls can be chained, as documented above.
  return message
def MergeLines(lines, message):
  """Parses an ASCII representation of a protocol message into a message.

  Like ParseLines(), but allows repeated values for a non-repeated field and
  uses the last one seen.

  Args:
    lines: An iterable of lines of a message's ASCII representation.
    message: A protocol buffer message to merge into.

  Returns:
    The same message passed as argument.

  Raises:
    ParseError: On ASCII parsing problems.
  """
  _ParseOrMerge(lines, message, True)
  # Return the message so calls can be chained, as documented above.
  return message
def _MergeField(tokenizer, message, allow_multiple_scalars):
  """Merges a single protocol message field into a message.

  Args:
    tokenizer: A tokenizer to parse the field name and values.
    message: A protocol message to record the data.
    allow_multiple_scalars: Determines if repeated values for a non-repeated
      field are permitted, e.g., the string "foo: 1 foo: 2" for a
      required/optional field named "foo".

  Raises:
    ParseError: In case of ASCII parsing problems.
  """
  message_descriptor = message.DESCRIPTOR
  if tokenizer.TryConsume('['):
    # An extension: "[qualified.field.name]".
    name = [tokenizer.ConsumeIdentifier()]
    while tokenizer.TryConsume('.'):
      name.append(tokenizer.ConsumeIdentifier())
    name = '.'.join(name)

    if not message_descriptor.is_extendable:
      raise tokenizer.ParseErrorPreviousToken(
          'Message type "%s" does not have extensions.' %
          message_descriptor.full_name)
    # pylint: disable=protected-access
    field = message.Extensions._FindExtensionByName(name)
    # pylint: enable=protected-access
    if not field:
      raise tokenizer.ParseErrorPreviousToken(
          'Extension "%s" not registered.' % name)
    elif message_descriptor != field.containing_type:
      raise tokenizer.ParseErrorPreviousToken(
          'Extension "%s" does not extend message type "%s".' % (
              name, message_descriptor.full_name))
    tokenizer.Consume(']')
  else:
    name = tokenizer.ConsumeIdentifier()
    field = message_descriptor.fields_by_name.get(name, None)

    # Group names are expected to be capitalized as they appear in the
    # .proto file, which actually matches their type names, not their field
    # names.
    if not field:
      field = message_descriptor.fields_by_name.get(name.lower(), None)
      if field and field.type != descriptor.FieldDescriptor.TYPE_GROUP:
        field = None

    if (field and field.type == descriptor.FieldDescriptor.TYPE_GROUP and
        field.message_type.name != name):
      field = None

    if not field:
      raise tokenizer.ParseErrorPreviousToken(
          'Message type "%s" has no field named "%s".' % (
              message_descriptor.full_name, name))

  if field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
    # The colon is optional before a nested message.
    tokenizer.TryConsume(':')

    if tokenizer.TryConsume('<'):
      end_token = '>'
    else:
      tokenizer.Consume('{')
      end_token = '}'

    if field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
      if field.is_extension:
        sub_message = message.Extensions[field].add()
      else:
        sub_message = getattr(message, field.name).add()
    else:
      if field.is_extension:
        sub_message = message.Extensions[field]
      else:
        sub_message = getattr(message, field.name)
      sub_message.SetInParent()

    while not tokenizer.TryConsume(end_token):
      if tokenizer.AtEnd():
        raise tokenizer.ParseErrorPreviousToken('Expected "%s".' % (end_token))
      _MergeField(tokenizer, sub_message, allow_multiple_scalars)
  else:
    _MergeScalarField(tokenizer, message, field, allow_multiple_scalars)

  # For historical reasons, fields may optionally be separated by commas or
  # semicolons.
  if not tokenizer.TryConsume(','):
    tokenizer.TryConsume(';')
def _MergeScalarField(tokenizer, message, field, allow_multiple_scalars):
  """Merges a single protocol message scalar field into a message.

  Args:
    tokenizer: A tokenizer to parse the field value.
    message: A protocol message to record the data.
    field: The descriptor of the field to be merged.
    allow_multiple_scalars: Determines if repeated values for a non-repeated
      field are permitted, e.g., the string "foo: 1 foo: 2" for a
      required/optional field named "foo".

  Raises:
    ParseError: In case of ASCII parsing problems.
    RuntimeError: On runtime errors.
  """
  tokenizer.Consume(':')

  # Dispatch on the wire type declared in the .proto file.
  if field.type in (descriptor.FieldDescriptor.TYPE_INT32,
                    descriptor.FieldDescriptor.TYPE_SINT32,
                    descriptor.FieldDescriptor.TYPE_SFIXED32):
    value = tokenizer.ConsumeInt32()
  elif field.type in (descriptor.FieldDescriptor.TYPE_INT64,
                      descriptor.FieldDescriptor.TYPE_SINT64,
                      descriptor.FieldDescriptor.TYPE_SFIXED64):
    value = tokenizer.ConsumeInt64()
  elif field.type in (descriptor.FieldDescriptor.TYPE_UINT32,
                      descriptor.FieldDescriptor.TYPE_FIXED32):
    value = tokenizer.ConsumeUint32()
  elif field.type in (descriptor.FieldDescriptor.TYPE_UINT64,
                      descriptor.FieldDescriptor.TYPE_FIXED64):
    value = tokenizer.ConsumeUint64()
  elif field.type in (descriptor.FieldDescriptor.TYPE_FLOAT,
                      descriptor.FieldDescriptor.TYPE_DOUBLE):
    value = tokenizer.ConsumeFloat()
  elif field.type == descriptor.FieldDescriptor.TYPE_BOOL:
    value = tokenizer.ConsumeBool()
  elif field.type == descriptor.FieldDescriptor.TYPE_STRING:
    value = tokenizer.ConsumeString()
  elif field.type == descriptor.FieldDescriptor.TYPE_BYTES:
    value = tokenizer.ConsumeByteString()
  elif field.type == descriptor.FieldDescriptor.TYPE_ENUM:
    value = tokenizer.ConsumeEnum(field)
  else:
    raise RuntimeError('Unknown field type %d' % field.type)

  if field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
    if field.is_extension:
      message.Extensions[field].append(value)
    else:
      getattr(message, field.name).append(value)
  else:
    if field.is_extension:
      if not allow_multiple_scalars and message.HasExtension(field):
        raise tokenizer.ParseErrorPreviousToken(
            'Message type "%s" should not have multiple "%s" extensions.' %
            (message.DESCRIPTOR.full_name, field.full_name))
      else:
        message.Extensions[field] = value
    else:
      if not allow_multiple_scalars and message.HasField(field.name):
        raise tokenizer.ParseErrorPreviousToken(
            'Message type "%s" should not have multiple "%s" fields.' %
            (message.DESCRIPTOR.full_name, field.name))
      else:
        setattr(message, field.name, value)
460 class _Tokenizer(object):
461 """Protocol buffer ASCII representation tokenizer.
463 This class handles the lower level string parsing by splitting it into
466 It was directly ported from the Java protocol buffer API.
469 _WHITESPACE = re.compile('(\\s|(#.*$))+', re.MULTILINE)
471 '[a-zA-Z_][0-9a-zA-Z_+-]*|' # an identifier
472 '[0-9+-][0-9a-zA-Z_.+-]*|' # a number
473 '\"([^\"\n\\\\]|\\\\.)*(\"|\\\\?$)|' # a double-quoted string
474 '\'([^\'\n\\\\]|\\\\.)*(\'|\\\\?$)') # a single-quoted string
475 _IDENTIFIER = re.compile(r'\w+')
477 def __init__(self, lines):
481 self._token_start = None
483 self._lines = iter(lines)
484 self._current_line = ''
485 self._previous_line = 0
486 self._previous_column = 0
487 self._more_lines = True
488 self._SkipWhitespace()
492 """Checks the end of the text was reached.
495 True iff the end was reached.
497 return not self.token
500 while len(self._current_line) <= self._column:
502 self._current_line = self._lines.next()
503 except StopIteration:
504 self._current_line = ''
505 self._more_lines = False
511 def _SkipWhitespace(self):
514 match = self._WHITESPACE.match(self._current_line, self._column)
517 length = len(match.group(0))
518 self._column += length
520 def TryConsume(self, token):
521 """Tries to consume a given piece of text.
524 token: Text to consume.
527 True iff the text was consumed.
529 if self.token == token:
534 def Consume(self, token):
535 """Consumes a piece of text.
538 token: Text to consume.
541 ParseError: If the text couldn't be consumed.
543 if not self.TryConsume(token):
544 raise self._ParseError('Expected "%s".' % token)
546 def ConsumeIdentifier(self):
547 """Consumes protocol message field identifier.
553 ParseError: If an identifier couldn't be consumed.
556 if not self._IDENTIFIER.match(result):
557 raise self._ParseError('Expected identifier.')
561 def ConsumeInt32(self):
562 """Consumes a signed 32bit integer number.
568 ParseError: If a signed 32bit integer couldn't be consumed.
571 result = ParseInteger(self.token, is_signed=True, is_long=False)
572 except ValueError, e:
573 raise self._ParseError(str(e))
577 def ConsumeUint32(self):
578 """Consumes an unsigned 32bit integer number.
584 ParseError: If an unsigned 32bit integer couldn't be consumed.
587 result = ParseInteger(self.token, is_signed=False, is_long=False)
588 except ValueError, e:
589 raise self._ParseError(str(e))
593 def ConsumeInt64(self):
594 """Consumes a signed 64bit integer number.
600 ParseError: If a signed 64bit integer couldn't be consumed.
603 result = ParseInteger(self.token, is_signed=True, is_long=True)
604 except ValueError, e:
605 raise self._ParseError(str(e))
609 def ConsumeUint64(self):
610 """Consumes an unsigned 64bit integer number.
616 ParseError: If an unsigned 64bit integer couldn't be consumed.
619 result = ParseInteger(self.token, is_signed=False, is_long=True)
620 except ValueError, e:
621 raise self._ParseError(str(e))
625 def ConsumeFloat(self):
626 """Consumes an floating point number.
632 ParseError: If a floating point number couldn't be consumed.
635 result = ParseFloat(self.token)
636 except ValueError, e:
637 raise self._ParseError(str(e))
641 def ConsumeBool(self):
642 """Consumes a boolean value.
648 ParseError: If a boolean value couldn't be consumed.
651 result = ParseBool(self.token)
652 except ValueError, e:
653 raise self._ParseError(str(e))
657 def ConsumeString(self):
658 """Consumes a string value.
664 ParseError: If a string value couldn't be consumed.
666 the_bytes = self.ConsumeByteString()
668 return unicode(the_bytes, 'utf-8')
669 except UnicodeDecodeError, e:
670 raise self._StringParseError(e)
672 def ConsumeByteString(self):
673 """Consumes a byte array value.
676 The array parsed (as a string).
679 ParseError: If a byte array value couldn't be consumed.
681 the_list = [self._ConsumeSingleByteString()]
682 while self.token and self.token[0] in ('\'', '"'):
683 the_list.append(self._ConsumeSingleByteString())
684 return ''.encode('latin1').join(the_list) ##PY25
685 ##!PY25 return b''.join(the_list)
687 def _ConsumeSingleByteString(self):
688 """Consume one token of a string literal.
690 String literals (whether bytes or text) can come in multiple adjacent
691 tokens which are automatically concatenated, like in C or Python. This
692 method only consumes one token.
695 if len(text) < 1 or text[0] not in ('\'', '"'):
696 raise self._ParseError('Expected string.')
698 if len(text) < 2 or text[-1] != text[0]:
699 raise self._ParseError('String missing ending quote.')
702 result = text_encoding.CUnescape(text[1:-1])
703 except ValueError, e:
704 raise self._ParseError(str(e))
708 def ConsumeEnum(self, field):
710 result = ParseEnum(field, self.token)
711 except ValueError, e:
712 raise self._ParseError(str(e))
716 def ParseErrorPreviousToken(self, message):
717 """Creates and *returns* a ParseError for the previously read token.
720 message: A message to set for the exception.
723 A ParseError instance.
725 return ParseError('%d:%d : %s' % (
726 self._previous_line + 1, self._previous_column + 1, message))
728 def _ParseError(self, message):
729 """Creates and *returns* a ParseError for the current token."""
730 return ParseError('%d:%d : %s' % (
731 self._line + 1, self._column + 1, message))
733 def _StringParseError(self, e):
734 return self._ParseError('Couldn\'t parse string: ' + str(e))
737 """Reads the next meaningful token."""
738 self._previous_line = self._line
739 self._previous_column = self._column
741 self._column += len(self.token)
742 self._SkipWhitespace()
744 if not self._more_lines:
748 match = self._TOKEN.match(self._current_line, self._column)
750 token = match.group(0)
753 self.token = self._current_line[self._column]
def ParseInteger(text, is_signed=False, is_long=False):
  """Parses an integer.

  Args:
    text: The text to parse.
    is_signed: True if a signed integer must be parsed.
    is_long: True if a long integer must be parsed.

  Returns:
    The integer value.

  Raises:
    ValueError: Thrown Iff the text is not a valid integer.
  """
  # Do the actual parsing. Exception handling is propagated to caller.
  try:
    # We force 32-bit values to int and 64-bit values to long to make
    # alternate implementations where the distinction is more significant
    # (e.g. the C++ implementation) simpler.
    if is_long:
      result = long(text, 0)
    else:
      result = int(text, 0)
  except ValueError:
    raise ValueError('Couldn\'t parse integer: %s' % text)

  # Check if the integer is sane. Exceptions handled by callers.
  checker = _INTEGER_CHECKERS[2 * int(is_long) + int(is_signed)]
  checker.CheckValue(result)
  return result
def ParseFloat(text):
  """Parse a floating point number.

  Args:
    text: Text to parse.

  Returns:
    The number parsed.

  Raises:
    ValueError: If a floating point number couldn't be parsed.
  """
  try:
    # Assume Python compatible syntax.
    return float(text)
  except ValueError:
    # Check alternative spellings.
    if _FLOAT_INFINITY.match(text):
      if text[0] == '-':
        return float('-inf')
      else:
        return float('inf')
    elif _FLOAT_NAN.match(text):
      return float('nan')
    else:
      # assume '1.0f' format
      try:
        return float(text.rstrip('f'))
      except ValueError:
        raise ValueError('Couldn\'t parse float: %s' % text)
def ParseBool(text):
  """Parse a boolean value.

  Args:
    text: Text to parse.

  Returns:
    Boolean values parsed

  Raises:
    ValueError: If text is not a valid boolean.
  """
  if text in ('true', 't', '1'):
    return True
  elif text in ('false', 'f', '0'):
    return False
  else:
    raise ValueError('Expected "true" or "false".')
def ParseEnum(field, value):
  """Parse an enum value.

  The value can be specified by a number (the enum value), or by
  a string literal (the enum name).

  Args:
    field: Enum field descriptor.
    value: String value.

  Returns:
    Enum value number.

  Raises:
    ValueError: If the enum value could not be parsed.
  """
  enum_descriptor = field.enum_type
  try:
    number = int(value, 0)
  except ValueError:
    # Identifier.
    enum_value = enum_descriptor.values_by_name.get(value, None)
    if enum_value is None:
      raise ValueError(
          'Enum type "%s" has no value named %s.' % (
              enum_descriptor.full_name, value))
  else:
    # Numeric value.
    enum_value = enum_descriptor.values_by_number.get(number, None)
    if enum_value is None:
      raise ValueError(
          'Enum type "%s" has no value with number %d.' % (
              enum_descriptor.full_name, number))
  return enum_value.number