1 # Protocol Buffers - Google's data interchange format
2 # Copyright 2008 Google Inc. All rights reserved.
3 # https://developers.google.com/protocol-buffers/
5 # Redistribution and use in source and binary forms, with or without
6 # modification, are permitted provided that the following conditions are
9 # * Redistributions of source code must retain the above copyright
10 # notice, this list of conditions and the following disclaimer.
11 # * Redistributions in binary form must reproduce the above
12 # copyright notice, this list of conditions and the following disclaimer
13 # in the documentation and/or other materials provided with the
15 # * Neither the name of Google Inc. nor the names of its
16 # contributors may be used to endorse or promote products derived from
17 # this software without specific prior written permission.
19 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 #PY25 compatible for GAE.
33 # Copyright 2007 Google Inc. All Rights Reserved.
35 """Contains routines for printing protocol messages in text format."""
37 __author__ = 'kenton@google.com (Kenton Varda)'
import cStringIO
import re

from google.protobuf.internal import type_checkers
from google.protobuf import descriptor
from google.protobuf import text_encoding
# Names exported by "from text_format import *".
__all__ = ['MessageToString', 'PrintMessage', 'PrintField',
           'PrintFieldValue', 'Merge']

# Range checkers for the four integer kinds, indexed by
# 2 * int(is_long) + int(is_signed) — see ParseInteger.
_INTEGER_CHECKERS = (type_checkers.Uint32ValueChecker(),
                     type_checkers.Int32ValueChecker(),
                     type_checkers.Uint64ValueChecker(),
                     type_checkers.Int64ValueChecker())
# Alternative textual spellings of infinity and NaN accepted when parsing
# floats (case-insensitive, optional trailing 'f'), e.g. "inf", "-Infinityf".
_FLOAT_INFINITY = re.compile('-?inf(?:inity)?f?', re.IGNORECASE)
_FLOAT_NAN = re.compile('nanf?', re.IGNORECASE)
# C++ field types holding floating point values; used by PrintFieldValue to
# decide when the float_format option applies.
_FLOAT_TYPES = frozenset([descriptor.FieldDescriptor.CPPTYPE_FLOAT,
                          descriptor.FieldDescriptor.CPPTYPE_DOUBLE])
class Error(Exception):
  """Base class for all errors raised by the text_format module."""
class ParseError(Error):
  """Raised when the ASCII representation of a message cannot be parsed."""
def MessageToString(message, as_utf8=False, as_one_line=False,
                    pointy_brackets=False, use_index_order=False,
                    float_format=None):
  """Convert protobuf message to text format.

  Floating point values can be formatted compactly with 15 digits of
  precision (which is the most that IEEE 754 "double" can guarantee)
  using float_format='.15g'.

  Args:
    message: The protocol buffers message.
    as_utf8: Produce text output in UTF8 format.
    as_one_line: Don't introduce newlines between fields.
    pointy_brackets: If True, use angle brackets instead of curly braces for
      nesting.
    use_index_order: If True, print fields of a proto message using the order
      defined in source code instead of the field number. By default, use the
      field number order.
    float_format: If set, use this to specify floating point number formatting
      (per the "Format Specification Mini-Language"); otherwise, str() is used.

  Returns:
    A string of the text formatted protocol buffer message.
  """
  out = cStringIO.StringIO()
  PrintMessage(message, out, as_utf8=as_utf8, as_one_line=as_one_line,
               pointy_brackets=pointy_brackets,
               use_index_order=use_index_order,
               float_format=float_format)
  result = out.getvalue()
  if as_one_line:
    # Single-line output ends with a trailing field separator; drop it.
    return result.rstrip()
  return result
def PrintMessage(message, out, indent=0, as_utf8=False, as_one_line=False,
                 pointy_brackets=False, use_index_order=False,
                 float_format=None):
  """Print a text representation of `message` to the file-like object `out`.

  Args:
    message: The protocol buffers message to print.
    out: A file-like object to write the text representation to.
    indent: Number of spaces each line is indented by (for nested messages).
    as_utf8: Produce text output in UTF8 format.
    as_one_line: Don't introduce newlines between fields.
    pointy_brackets: If True, use angle brackets instead of curly braces.
    use_index_order: If True, sort fields by their source-code declaration
      index instead of by field number.
    float_format: Optional format spec for floating point values.
  """
  fields = message.ListFields()
  if use_index_order:
    # FieldDescriptor.index is the declaration order in the .proto file.
    fields.sort(key=lambda x: x[0].index)
  for field, value in fields:
    if field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
      # Repeated fields are printed as one name/value pair per element.
      for element in value:
        PrintField(field, element, out, indent, as_utf8, as_one_line,
                   pointy_brackets=pointy_brackets,
                   float_format=float_format)
    else:
      PrintField(field, value, out, indent, as_utf8, as_one_line,
                 pointy_brackets=pointy_brackets,
                 float_format=float_format)
def PrintField(field, value, out, indent=0, as_utf8=False, as_one_line=False,
               pointy_brackets=False, float_format=None):
  """Print a single field name/value pair.  For repeated fields, the value
  should be a single element."""

  out.write(' ' * indent)
  if field.is_extension:
    out.write('[')
    # MessageSet extensions are printed using the name of the contained
    # message type rather than the extension's full name.
    if (field.containing_type.GetOptions().message_set_wire_format and
        field.type == descriptor.FieldDescriptor.TYPE_MESSAGE and
        field.message_type == field.extension_scope and
        field.label == descriptor.FieldDescriptor.LABEL_OPTIONAL):
      out.write(field.message_type.full_name)
    else:
      out.write(field.full_name)
    out.write(']')
  elif field.type == descriptor.FieldDescriptor.TYPE_GROUP:
    # For groups, use the capitalized name.
    out.write(field.message_type.name)
  else:
    out.write(field.name)

  if field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
    # The colon is optional in this case, but our cross-language golden files
    # don't include it.
    out.write(': ')

  PrintFieldValue(field, value, out, indent, as_utf8, as_one_line,
                  pointy_brackets=pointy_brackets,
                  float_format=float_format)
  if as_one_line:
    out.write(' ')
  else:
    out.write('\n')
def PrintFieldValue(field, value, out, indent=0, as_utf8=False,
                    as_one_line=False, pointy_brackets=False,
                    float_format=None):
  """Print a single field value (not including name).  For repeated fields,
  the value should be a single element.

  Args:
    field: The descriptor of the field being printed.
    value: The value of the field (a single element for repeated fields).
    out: A file-like object the text is written to.
    indent: Indentation (in spaces) for nested messages.
    as_utf8: Write string fields as UTF8 instead of escaping them.
    as_one_line: Print nested messages without newlines.
    pointy_brackets: Use '<' '>' instead of '{' '}' around nested messages.
    float_format: Optional format spec applied to float/double values.
  """
  if pointy_brackets:
    openb = '<'
    closeb = '>'
  else:
    openb = '{'
    closeb = '}'

  if field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
    if as_one_line:
      out.write(' %s ' % openb)
      PrintMessage(value, out, indent, as_utf8, as_one_line,
                   pointy_brackets=pointy_brackets,
                   float_format=float_format)
      out.write(closeb)
    else:
      out.write(' %s\n' % openb)
      PrintMessage(value, out, indent + 2, as_utf8, as_one_line,
                   pointy_brackets=pointy_brackets,
                   float_format=float_format)
      out.write(' ' * indent + closeb)
  elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_ENUM:
    enum_value = field.enum_type.values_by_number.get(value, None)
    if enum_value is not None:
      out.write(enum_value.name)
    else:
      # Unknown enum number: fall back to printing the raw value.
      out.write(str(value))
  elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_STRING:
    out.write('\"')
    if isinstance(value, unicode):
      out_value = value.encode('utf-8')
    else:
      out_value = value
    if field.type == descriptor.FieldDescriptor.TYPE_BYTES:
      # We need to escape non-UTF8 chars in TYPE_BYTES field.
      out_as_utf8 = False
    else:
      out_as_utf8 = as_utf8
    out.write(text_encoding.CEscape(out_value, out_as_utf8))
    out.write('\"')
  elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_BOOL:
    if value:
      out.write('true')
    else:
      out.write('false')
  elif field.cpp_type in _FLOAT_TYPES and float_format is not None:
    out.write('{1:{0}}'.format(float_format, value))
  else:
    out.write(str(value))
def _ParseOrMerge(lines, message, allow_multiple_scalars):
  """Converts an ASCII representation of a protocol message into a message.

  Args:
    lines: Lines of a message's ASCII representation.
    message: A protocol buffer message to merge into.
    allow_multiple_scalars: Determines if repeated values for a non-repeated
      field are permitted, e.g., the string "foo: 1 foo: 2" for a
      required/optional field named "foo".

  Raises:
    ParseError: On ASCII parsing problems.
  """
  tok = _Tokenizer(lines)
  # Merge top-level fields one at a time until the input is exhausted.
  while not tok.AtEnd():
    _MergeField(tok, message, allow_multiple_scalars)
def Parse(text, message):
  """Parses an ASCII representation of a protocol message into a message.

  Args:
    text: Message ASCII representation.
    message: A protocol buffer message to merge into.

  Returns:
    The same message passed as argument.

  Raises:
    ParseError: On ASCII parsing problems.
  """
  if not isinstance(text, str):
    # Accept byte input by decoding it as UTF-8 first.
    text = text.decode('utf-8')
  return ParseLines(text.split('\n'), message)
def Merge(text, message):
  """Parses an ASCII representation of a protocol message into a message.

  Like Parse(), but allows repeated values for a non-repeated field, and uses
  the last one seen.

  Args:
    text: Message ASCII representation.
    message: A protocol buffer message to merge into.

  Returns:
    The same message passed as argument.

  Raises:
    ParseError: On ASCII parsing problems.
  """
  lines = text.split('\n')
  return MergeLines(lines, message)
def ParseLines(lines, message):
  """Parses an ASCII representation of a protocol message into a message.

  Args:
    lines: An iterable of lines of a message's ASCII representation.
    message: A protocol buffer message to merge into.

  Returns:
    The same message passed as argument.

  Raises:
    ParseError: On ASCII parsing problems.
  """
  _ParseOrMerge(lines, message, False)
  # Return the message so calls can be chained, as documented above.
  return message
def MergeLines(lines, message):
  """Parses an ASCII representation of a protocol message into a message.

  Like ParseLines(), but allows repeated values for a non-repeated field and
  uses the last one seen.

  Args:
    lines: An iterable of lines of a message's ASCII representation.
    message: A protocol buffer message to merge into.

  Returns:
    The same message passed as argument.

  Raises:
    ParseError: On ASCII parsing problems.
  """
  _ParseOrMerge(lines, message, True)
  # Return the message so calls can be chained, as documented above.
  return message
def _MergeField(tokenizer, message, allow_multiple_scalars):
  """Merges a single protocol message field into a message.

  Args:
    tokenizer: A tokenizer to parse the field name and values.
    message: A protocol message to record the data.
    allow_multiple_scalars: Determines if repeated values for a non-repeated
      field are permitted, e.g., the string "foo: 1 foo: 2" for a
      required/optional field named "foo".

  Raises:
    ParseError: In case of ASCII parsing problems.
  """
  message_descriptor = message.DESCRIPTOR
  if tokenizer.TryConsume('['):
    # An extension: "[qualified.field.name]".
    name = [tokenizer.ConsumeIdentifier()]
    while tokenizer.TryConsume('.'):
      name.append(tokenizer.ConsumeIdentifier())
    name = '.'.join(name)

    if not message_descriptor.is_extendable:
      raise tokenizer.ParseErrorPreviousToken(
          'Message type "%s" does not have extensions.' %
          message_descriptor.full_name)
    # pylint: disable=protected-access
    field = message.Extensions._FindExtensionByName(name)
    # pylint: enable=protected-access
    if not field:
      raise tokenizer.ParseErrorPreviousToken(
          'Extension "%s" not registered.' % name)
    elif message_descriptor != field.containing_type:
      raise tokenizer.ParseErrorPreviousToken(
          'Extension "%s" does not extend message type "%s".' % (
              name, message_descriptor.full_name))
    tokenizer.Consume(']')
  else:
    name = tokenizer.ConsumeIdentifier()
    field = message_descriptor.fields_by_name.get(name, None)

    # Group names are expected to be capitalized as they appear in the
    # .proto file, which actually matches their type names, not their field
    # names.
    if not field:
      field = message_descriptor.fields_by_name.get(name.lower(), None)
      if field and field.type != descriptor.FieldDescriptor.TYPE_GROUP:
        field = None

    if (field and field.type == descriptor.FieldDescriptor.TYPE_GROUP and
        field.message_type.name != name):
      field = None

    if not field:
      raise tokenizer.ParseErrorPreviousToken(
          'Message type "%s" has no field named "%s".' % (
              message_descriptor.full_name, name))

  if field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
    # The colon is optional before a nested message.
    tokenizer.TryConsume(':')

    if tokenizer.TryConsume('<'):
      end_token = '>'
    else:
      tokenizer.Consume('{')
      end_token = '}'

    if field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
      if field.is_extension:
        sub_message = message.Extensions[field].add()
      else:
        sub_message = getattr(message, field.name).add()
    else:
      if field.is_extension:
        sub_message = message.Extensions[field]
      else:
        sub_message = getattr(message, field.name)
      sub_message.SetInParent()

    while not tokenizer.TryConsume(end_token):
      if tokenizer.AtEnd():
        raise tokenizer.ParseErrorPreviousToken('Expected "%s".' % (end_token))
      _MergeField(tokenizer, sub_message, allow_multiple_scalars)
  else:
    _MergeScalarField(tokenizer, message, field, allow_multiple_scalars)

  # For historical reasons, fields may optionally be separated by commas or
  # semicolons.
  if not tokenizer.TryConsume(','):
    tokenizer.TryConsume(';')
def _MergeScalarField(tokenizer, message, field, allow_multiple_scalars):
  """Merges a single protocol message scalar field into a message.

  Args:
    tokenizer: A tokenizer to parse the field value.
    message: A protocol message to record the data.
    field: The descriptor of the field to be merged.
    allow_multiple_scalars: Determines if repeated values for a non-repeated
      field are permitted, e.g., the string "foo: 1 foo: 2" for a
      required/optional field named "foo".

  Raises:
    ParseError: In case of ASCII parsing problems.
    RuntimeError: On runtime errors.
  """
  tokenizer.Consume(':')

  # Dispatch on the wire type declared in the .proto file.
  if field.type in (descriptor.FieldDescriptor.TYPE_INT32,
                    descriptor.FieldDescriptor.TYPE_SINT32,
                    descriptor.FieldDescriptor.TYPE_SFIXED32):
    value = tokenizer.ConsumeInt32()
  elif field.type in (descriptor.FieldDescriptor.TYPE_INT64,
                      descriptor.FieldDescriptor.TYPE_SINT64,
                      descriptor.FieldDescriptor.TYPE_SFIXED64):
    value = tokenizer.ConsumeInt64()
  elif field.type in (descriptor.FieldDescriptor.TYPE_UINT32,
                      descriptor.FieldDescriptor.TYPE_FIXED32):
    value = tokenizer.ConsumeUint32()
  elif field.type in (descriptor.FieldDescriptor.TYPE_UINT64,
                      descriptor.FieldDescriptor.TYPE_FIXED64):
    value = tokenizer.ConsumeUint64()
  elif field.type in (descriptor.FieldDescriptor.TYPE_FLOAT,
                      descriptor.FieldDescriptor.TYPE_DOUBLE):
    value = tokenizer.ConsumeFloat()
  elif field.type == descriptor.FieldDescriptor.TYPE_BOOL:
    value = tokenizer.ConsumeBool()
  elif field.type == descriptor.FieldDescriptor.TYPE_STRING:
    value = tokenizer.ConsumeString()
  elif field.type == descriptor.FieldDescriptor.TYPE_BYTES:
    value = tokenizer.ConsumeByteString()
  elif field.type == descriptor.FieldDescriptor.TYPE_ENUM:
    value = tokenizer.ConsumeEnum(field)
  else:
    raise RuntimeError('Unknown field type %d' % field.type)

  if field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
    if field.is_extension:
      message.Extensions[field].append(value)
    else:
      getattr(message, field.name).append(value)
  else:
    if field.is_extension:
      if not allow_multiple_scalars and message.HasExtension(field):
        raise tokenizer.ParseErrorPreviousToken(
            'Message type "%s" should not have multiple "%s" extensions.' %
            (message.DESCRIPTOR.full_name, field.full_name))
      else:
        message.Extensions[field] = value
    else:
      if not allow_multiple_scalars and message.HasField(field.name):
        raise tokenizer.ParseErrorPreviousToken(
            'Message type "%s" should not have multiple "%s" fields.' %
            (message.DESCRIPTOR.full_name, field.name))
      else:
        setattr(message, field.name, value)
460 class _Tokenizer(object):
461 """Protocol buffer ASCII representation tokenizer.
463 This class handles the lower level string parsing by splitting it into
466 It was directly ported from the Java protocol buffer API.
469 _WHITESPACE = re.compile('(\\s|(#.*$))+', re.MULTILINE)
471 '[a-zA-Z_][0-9a-zA-Z_+-]*|' # an identifier
472 '[0-9+-][0-9a-zA-Z_.+-]*|' # a number
473 '\"([^\"\n\\\\]|\\\\.)*(\"|\\\\?$)|' # a double-quoted string
474 '\'([^\'\n\\\\]|\\\\.)*(\'|\\\\?$)') # a single-quoted string
475 _IDENTIFIER = re.compile(r'\w+')
477 def __init__(self, lines):
481 self._token_start = None
483 self._lines = iter(lines)
484 self._current_line = ''
485 self._previous_line = 0
486 self._previous_column = 0
487 self._more_lines = True
488 self._SkipWhitespace()
492 """Checks the end of the text was reached.
495 True iff the end was reached.
497 return not self.token
500 while len(self._current_line) <= self._column:
502 self._current_line = self._lines.next()
503 except StopIteration:
504 self._current_line = ''
505 self._more_lines = False
511 def _SkipWhitespace(self):
514 match = self._WHITESPACE.match(self._current_line, self._column)
517 length = len(match.group(0))
518 self._column += length
520 def TryConsume(self, token):
521 """Tries to consume a given piece of text.
524 token: Text to consume.
527 True iff the text was consumed.
529 if self.token == token:
534 def Consume(self, token):
535 """Consumes a piece of text.
538 token: Text to consume.
541 ParseError: If the text couldn't be consumed.
543 if not self.TryConsume(token):
544 raise self._ParseError('Expected "%s".' % token)
546 def ConsumeIdentifier(self):
547 """Consumes protocol message field identifier.
553 ParseError: If an identifier couldn't be consumed.
556 if not self._IDENTIFIER.match(result):
557 raise self._ParseError('Expected identifier.')
561 def ConsumeInt32(self):
562 """Consumes a signed 32bit integer number.
568 ParseError: If a signed 32bit integer couldn't be consumed.
571 result = ParseInteger(self.token, is_signed=True, is_long=False)
572 except ValueError, e:
573 raise self._ParseError(str(e))
577 def ConsumeUint32(self):
578 """Consumes an unsigned 32bit integer number.
584 ParseError: If an unsigned 32bit integer couldn't be consumed.
587 result = ParseInteger(self.token, is_signed=False, is_long=False)
588 except ValueError, e:
589 raise self._ParseError(str(e))
593 def ConsumeInt64(self):
594 """Consumes a signed 64bit integer number.
600 ParseError: If a signed 64bit integer couldn't be consumed.
603 result = ParseInteger(self.token, is_signed=True, is_long=True)
604 except ValueError, e:
605 raise self._ParseError(str(e))
609 def ConsumeUint64(self):
610 """Consumes an unsigned 64bit integer number.
616 ParseError: If an unsigned 64bit integer couldn't be consumed.
619 result = ParseInteger(self.token, is_signed=False, is_long=True)
620 except ValueError, e:
621 raise self._ParseError(str(e))
625 def ConsumeFloat(self):
626 """Consumes an floating point number.
632 ParseError: If a floating point number couldn't be consumed.
635 result = ParseFloat(self.token)
636 except ValueError, e:
637 raise self._ParseError(str(e))
641 def ConsumeBool(self):
642 """Consumes a boolean value.
648 ParseError: If a boolean value couldn't be consumed.
651 result = ParseBool(self.token)
652 except ValueError, e:
653 raise self._ParseError(str(e))
657 def ConsumeString(self):
658 """Consumes a string value.
664 ParseError: If a string value couldn't be consumed.
666 the_bytes = self.ConsumeByteString()
668 return unicode(the_bytes, 'utf-8')
669 except UnicodeDecodeError, e:
670 raise self._StringParseError(e)
672 def ConsumeByteString(self):
673 """Consumes a byte array value.
676 The array parsed (as a string).
679 ParseError: If a byte array value couldn't be consumed.
681 the_list = [self._ConsumeSingleByteString()]
682 while self.token and self.token[0] in ('\'', '"'):
683 the_list.append(self._ConsumeSingleByteString())
684 return ''.encode('latin1').join(the_list) ##PY25
685 ##!PY25 return b''.join(the_list)
687 def _ConsumeSingleByteString(self):
688 """Consume one token of a string literal.
690 String literals (whether bytes or text) can come in multiple adjacent
691 tokens which are automatically concatenated, like in C or Python. This
692 method only consumes one token.
695 if len(text) < 1 or text[0] not in ('\'', '"'):
696 raise self._ParseError('Expected string.')
698 if len(text) < 2 or text[-1] != text[0]:
699 raise self._ParseError('String missing ending quote.')
702 result = text_encoding.CUnescape(text[1:-1])
703 except ValueError, e:
704 raise self._ParseError(str(e))
708 def ConsumeEnum(self, field):
710 result = ParseEnum(field, self.token)
711 except ValueError, e:
712 raise self._ParseError(str(e))
716 def ParseErrorPreviousToken(self, message):
717 """Creates and *returns* a ParseError for the previously read token.
720 message: A message to set for the exception.
723 A ParseError instance.
725 return ParseError('%d:%d : %s' % (
726 self._previous_line + 1, self._previous_column + 1, message))
728 def _ParseError(self, message):
729 """Creates and *returns* a ParseError for the current token."""
730 return ParseError('%d:%d : %s' % (
731 self._line + 1, self._column + 1, message))
733 def _StringParseError(self, e):
734 return self._ParseError('Couldn\'t parse string: ' + str(e))
737 """Reads the next meaningful token."""
738 self._previous_line = self._line
739 self._previous_column = self._column
741 self._column += len(self.token)
742 self._SkipWhitespace()
744 if not self._more_lines:
748 match = self._TOKEN.match(self._current_line, self._column)
750 token = match.group(0)
753 self.token = self._current_line[self._column]
def ParseInteger(text, is_signed=False, is_long=False):
  """Parses an integer.

  Args:
    text: The text to parse.
    is_signed: True if a signed integer must be parsed.
    is_long: True if a long integer must be parsed.

  Returns:
    The integer value.

  Raises:
    ValueError: Thrown Iff the text is not a valid integer.
  """
  # Do the actual parsing. Exception handling is propagated to caller.
  try:
    # We force 32-bit values to int and 64-bit values to long to make
    # alternate implementations where the distinction is more significant
    # (e.g. the C++ implementation) simpler.
    if is_long:
      result = long(text, 0)
    else:
      result = int(text, 0)
  except ValueError:
    raise ValueError('Couldn\'t parse integer: %s' % text)

  # Check if the integer is sane. Exceptions handled by callers.
  checker = _INTEGER_CHECKERS[2 * int(is_long) + int(is_signed)]
  checker.CheckValue(result)
  return result
def ParseFloat(text):
  """Parse a floating point number.

  Args:
    text: Text to parse.

  Returns:
    The number parsed.

  Raises:
    ValueError: If a floating point number couldn't be parsed.
  """
  try:
    # Assume Python compatible syntax.
    return float(text)
  except ValueError:
    # Check alternative spellings.
    if _FLOAT_INFINITY.match(text):
      if text[0] == '-':
        return float('-inf')
      else:
        return float('inf')
    elif _FLOAT_NAN.match(text):
      return float('nan')
    else:
      # assume '1.0f' format
      try:
        return float(text.rstrip('f'))
      except ValueError:
        raise ValueError('Couldn\'t parse float: %s' % text)
def ParseBool(text):
  """Parse a boolean value.

  Args:
    text: Text to parse.

  Returns:
    Boolean values parsed

  Raises:
    ValueError: If text is not a valid boolean.
  """
  if text in ('true', 't', '1'):
    return True
  elif text in ('false', 'f', '0'):
    return False
  else:
    raise ValueError('Expected "true" or "false".')
def ParseEnum(field, value):
  """Parse an enum value.

  The value can be specified by a number (the enum value), or by
  a string literal (the enum name).

  Args:
    field: Enum field descriptor.
    value: String value.

  Returns:
    Enum value number.

  Raises:
    ValueError: If the enum value could not be parsed.
  """
  enum_descriptor = field.enum_type
  try:
    number = int(value, 0)
  except ValueError:
    # Identifier.
    enum_value = enum_descriptor.values_by_name.get(value, None)
    if enum_value is None:
      raise ValueError(
          'Enum type "%s" has no value named %s.' % (
              enum_descriptor.full_name, value))
  else:
    # Numeric value.
    enum_value = enum_descriptor.values_by_number.get(number, None)
    if enum_value is None:
      raise ValueError(
          'Enum type "%s" has no value with number %d.' % (
              enum_descriptor.full_name, number))
  return enum_value.number