1 # Protocol Buffers - Google's data interchange format
2 # Copyright 2008 Google Inc. All rights reserved.
3 # http://code.google.com/p/protobuf/
5 # Redistribution and use in source and binary forms, with or without
6 # modification, are permitted provided that the following conditions are
9 # * Redistributions of source code must retain the above copyright
10 # notice, this list of conditions and the following disclaimer.
11 # * Redistributions in binary form must reproduce the above
12 # copyright notice, this list of conditions and the following disclaimer
13 # in the documentation and/or other materials provided with the
15 # * Neither the name of Google Inc. nor the names of its
16 # contributors may be used to endorse or promote products derived from
17 # this software without specific prior written permission.
19 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 """Contains routines for printing protocol messages in text format."""
33 __author__ = 'kenton@google.com (Kenton Varda)'
import cStringIO
import re

from collections import deque

from google.protobuf.internal import type_checkers
from google.protobuf import descriptor
# Public API of this module; helpers prefixed with '_' are internal.
__all__ = [ 'MessageToString', 'PrintMessage', 'PrintField',
            'PrintFieldValue', 'Merge' ]
# Range checkers indexed by 2 * is_long + is_signed (see ParseInteger).
_INTEGER_CHECKERS = (type_checkers.Uint32ValueChecker(),
                     type_checkers.Int32ValueChecker(),
                     type_checkers.Uint64ValueChecker(),
                     type_checkers.Int64ValueChecker())
# Alternative spellings of float special values accepted by ParseFloat
# ('inf', '-Infinity', 'inff', 'nan', 'nanf', ...).
_FLOAT_INFINITY = re.compile('-?inf(?:inity)?f?', re.IGNORECASE)
_FLOAT_NAN = re.compile('nanf?', re.IGNORECASE)
class ParseError(Exception):
  """Raised when the ASCII text form of a protocol message cannot be parsed."""
def MessageToString(message, as_utf8=False, as_one_line=False):
  """Convert a protocol message to its ASCII text representation.

  Args:
    message: The protocol buffer message to format.
    as_utf8: If True, non-ASCII bytes of string fields are emitted as-is
      (assumed UTF-8) instead of octal escapes.
    as_one_line: If True, the whole message is rendered on a single line.

  Returns:
    The text-format string for `message`.
  """
  out = cStringIO.StringIO()
  PrintMessage(message, out, as_utf8=as_utf8, as_one_line=as_one_line)
  result = out.getvalue()
  out.close()
  if as_one_line:
    # The one-line printer separates fields with a trailing space; drop it.
    return result.rstrip()
  return result
def PrintMessage(message, out, indent=0, as_utf8=False, as_one_line=False):
  """Write the text form of every set field of `message` to `out`.

  Args:
    message: The protocol buffer message to print.
    out: A file-like object (anything with a write() method).
    indent: Number of leading spaces for each field line.
    as_utf8: Passed through to the string escaping in PrintFieldValue.
    as_one_line: If True, fields are separated by spaces instead of newlines.
  """
  for field, value in message.ListFields():
    if field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
      # Repeated fields print one name/value pair per element.
      for element in value:
        PrintField(field, element, out, indent, as_utf8, as_one_line)
    else:
      PrintField(field, value, out, indent, as_utf8, as_one_line)
def PrintField(field, value, out, indent=0, as_utf8=False, as_one_line=False):
  """Print a single field name/value pair.  For repeated fields, the value
  should be a single element.
  """
  out.write(' ' * indent)
  if field.is_extension:
    # Extensions are printed with their full name in square brackets.
    out.write('[')
    if (field.containing_type.GetOptions().message_set_wire_format and
        field.type == descriptor.FieldDescriptor.TYPE_MESSAGE and
        field.message_type == field.extension_scope and
        field.label == descriptor.FieldDescriptor.LABEL_OPTIONAL):
      # MessageSet extensions are identified by their contained type's name.
      out.write(field.message_type.full_name)
    else:
      out.write(field.full_name)
    out.write(']')
  elif field.type == descriptor.FieldDescriptor.TYPE_GROUP:
    # For groups, use the capitalized name.
    out.write(field.message_type.name)
  else:
    out.write(field.name)

  if field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
    # The colon is optional in this case, but our cross-language golden files
    # don't include it.
    out.write(': ')

  PrintFieldValue(field, value, out, indent, as_utf8, as_one_line)
  if as_one_line:
    out.write(' ')
  else:
    out.write('\n')
def PrintFieldValue(field, value, out, indent=0,
                    as_utf8=False, as_one_line=False):
  """Print a single field value (not including name).  For repeated fields,
  the value should be a single element.
  """
  if field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
    if as_one_line:
      out.write(' { ')
      PrintMessage(value, out, indent, as_utf8, as_one_line)
      out.write('}')
    else:
      out.write(' {\n')
      PrintMessage(value, out, indent + 2, as_utf8, as_one_line)
      out.write(' ' * indent + '}')
  elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_ENUM:
    enum_value = field.enum_type.values_by_number.get(value, None)
    if enum_value is not None:
      out.write(enum_value.name)
    else:
      # Unknown enum number: fall back to the raw numeric value.
      out.write(str(value))
  elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_STRING:
    out.write('\"')
    if type(value) is unicode:
      out.write(_CEscape(value.encode('utf-8'), as_utf8))
    else:
      out.write(_CEscape(value, as_utf8))
    out.write('\"')
  elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_BOOL:
    if value:
      out.write("true")
    else:
      out.write("false")
  else:
    out.write(str(value))
def Merge(text, message):
  """Merges an ASCII representation of a protocol message into a message.

  Args:
    text: Message ASCII representation.
    message: A protocol buffer message to merge into.

  Raises:
    ParseError: On ASCII parsing problems.
  """
  tokenizer = _Tokenizer(text)
  # One _MergeField call consumes one top-level field; loop until the
  # tokenizer reports end of input.
  while not tokenizer.AtEnd():
    _MergeField(tokenizer, message)
def _MergeField(tokenizer, message):
  """Merges a single protocol message field into a message.

  Args:
    tokenizer: A tokenizer to parse the field name and values.
    message: A protocol message to record the data.

  Raises:
    ParseError: In case of ASCII parsing problems.
  """
  message_descriptor = message.DESCRIPTOR
  if tokenizer.TryConsume('['):
    # Extension field, written as "[qualified.extension.name]".
    name = [tokenizer.ConsumeIdentifier()]
    while tokenizer.TryConsume('.'):
      name.append(tokenizer.ConsumeIdentifier())
    name = '.'.join(name)

    if not message_descriptor.is_extendable:
      raise tokenizer.ParseErrorPreviousToken(
          'Message type "%s" does not have extensions.' %
          message_descriptor.full_name)
    field = message.Extensions._FindExtensionByName(name)
    if not field:
      raise tokenizer.ParseErrorPreviousToken(
          'Extension "%s" not registered.' % name)
    elif message_descriptor != field.containing_type:
      raise tokenizer.ParseErrorPreviousToken(
          'Extension "%s" does not extend message type "%s".' % (
              name, message_descriptor.full_name))
    tokenizer.Consume(']')
  else:
    name = tokenizer.ConsumeIdentifier()
    field = message_descriptor.fields_by_name.get(name, None)
    if not field:
      # Group names are expected to be capitalized as they appear in the
      # .proto file, which actually matches their type names, not their field
      # names.
      field = message_descriptor.fields_by_name.get(name.lower(), None)
      if field and field.type != descriptor.FieldDescriptor.TYPE_GROUP:
        field = None

      if (field and field.type == descriptor.FieldDescriptor.TYPE_GROUP and
          field.message_type.name != name):
        field = None

    if not field:
      raise tokenizer.ParseErrorPreviousToken(
          'Message type "%s" has no field named "%s".' % (
              message_descriptor.full_name, name))

  if field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
    # The colon is optional before a sub-message.
    tokenizer.TryConsume(':')

    # Sub-messages may be delimited by either <...> or {...}.
    if tokenizer.TryConsume('<'):
      end_token = '>'
    else:
      tokenizer.Consume('{')
      end_token = '}'

    if field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
      if field.is_extension:
        sub_message = message.Extensions[field].add()
      else:
        sub_message = getattr(message, field.name).add()
    else:
      if field.is_extension:
        sub_message = message.Extensions[field]
      else:
        sub_message = getattr(message, field.name)
      # Mark the (possibly empty) sub-message as explicitly present.
      sub_message.SetInParent()

    while not tokenizer.TryConsume(end_token):
      if tokenizer.AtEnd():
        raise tokenizer.ParseErrorPreviousToken('Expected "%s".' % (end_token))
      _MergeField(tokenizer, sub_message)
  else:
    _MergeScalarField(tokenizer, message, field)
def _MergeScalarField(tokenizer, message, field):
  """Merges a single protocol message scalar field into a message.

  Args:
    tokenizer: A tokenizer to parse the field value.
    message: A protocol message to record the data.
    field: The descriptor of the field to be merged.

  Raises:
    ParseError: In case of ASCII parsing problems.
    RuntimeError: On runtime errors.
  """
  tokenizer.Consume(':')
  value = None

  # Dispatch on the wire type to pick the right Consume* method.
  if field.type in (descriptor.FieldDescriptor.TYPE_INT32,
                    descriptor.FieldDescriptor.TYPE_SINT32,
                    descriptor.FieldDescriptor.TYPE_SFIXED32):
    value = tokenizer.ConsumeInt32()
  elif field.type in (descriptor.FieldDescriptor.TYPE_INT64,
                      descriptor.FieldDescriptor.TYPE_SINT64,
                      descriptor.FieldDescriptor.TYPE_SFIXED64):
    value = tokenizer.ConsumeInt64()
  elif field.type in (descriptor.FieldDescriptor.TYPE_UINT32,
                      descriptor.FieldDescriptor.TYPE_FIXED32):
    value = tokenizer.ConsumeUint32()
  elif field.type in (descriptor.FieldDescriptor.TYPE_UINT64,
                      descriptor.FieldDescriptor.TYPE_FIXED64):
    value = tokenizer.ConsumeUint64()
  elif field.type in (descriptor.FieldDescriptor.TYPE_FLOAT,
                      descriptor.FieldDescriptor.TYPE_DOUBLE):
    value = tokenizer.ConsumeFloat()
  elif field.type == descriptor.FieldDescriptor.TYPE_BOOL:
    value = tokenizer.ConsumeBool()
  elif field.type == descriptor.FieldDescriptor.TYPE_STRING:
    value = tokenizer.ConsumeString()
  elif field.type == descriptor.FieldDescriptor.TYPE_BYTES:
    value = tokenizer.ConsumeByteString()
  elif field.type == descriptor.FieldDescriptor.TYPE_ENUM:
    value = tokenizer.ConsumeEnum(field)
  else:
    raise RuntimeError('Unknown field type %d' % field.type)

  if field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
    if field.is_extension:
      message.Extensions[field].append(value)
    else:
      getattr(message, field.name).append(value)
  else:
    if field.is_extension:
      message.Extensions[field] = value
    else:
      setattr(message, field.name, value)
class _Tokenizer(object):
  """Protocol buffer ASCII representation tokenizer.

  This class handles the lower level string parsing by splitting it into
  meaningful tokens.

  It was directly ported from the Java protocol buffer API.
  """

  # Whitespace and '#'-to-end-of-line comments, skipped between tokens.
  _WHITESPACE = re.compile('(\\s|(#.*$))+', re.MULTILINE)
  # NOTE(review): a `_TOKEN = re.compile(` line appears to be missing
  # before the alternation below -- confirm against upstream.
      '[a-zA-Z_][0-9a-zA-Z_+-]*|'           # an identifier
      '[0-9+-][0-9a-zA-Z_.+-]*|'            # a number
      '\"([^\"\n\\\\]|\\\\.)*(\"|\\\\?$)|'  # a double-quoted string
      '\'([^\'\n\\\\]|\\\\.)*(\'|\\\\?$)')  # a single-quoted string
  _IDENTIFIER = re.compile('\w+')

  def __init__(self, text_message):
    # Keep the whole input around; presumably for debugging -- confirm.
    self._text_message = text_message
    # NOTE(review): initializers such as `self._line`, `self._column` and
    # `self.token` (all read elsewhere in this class) appear to be missing.
    self._token_start = None
    # Remaining input lines, consumed from the left as parsing advances.
    self._lines = deque(text_message.split('\n'))
    self._current_line = ''
    self._previous_line = 0
    self._previous_column = 0
    self._SkipWhitespace()
    # NOTE(review): a call to read the first token (`self.NextToken()`)
    # appears to be missing at the end of __init__.

  # NOTE(review): a `def AtEnd(self):` line appears to be missing above
  # this docstring.
    """Checks the end of the text was reached.

    Returns:
      True iff the end was reached.
    """
    return self.token == ''

  # NOTE(review): a `def _PopLine(self):` line appears to be missing here;
  # the loop below advances to the next line once the current one is
  # exhausted.
    while len(self._current_line) <= self._column:
      # NOTE(review): an `if not self._lines:` guard with a `return` (plus
      # line/column bookkeeping) appears to be missing around these lines.
      self._current_line = ''
      self._current_line = self._lines.popleft()

  def _SkipWhitespace(self):
    # Skip over whitespace and comments before the next token.
    # NOTE(review): upstream wraps the lines below in a `while True:` loop
    # with a `self._PopLine()` call and a `break` when nothing matches.
    match = self._WHITESPACE.match(self._current_line, self._column)
    length = len(match.group(0))
    self._column += length

  def TryConsume(self, token):
    """Tries to consume a given piece of text.

    Args:
      token: Text to consume.

    Returns:
      True iff the text was consumed.
    """
    if self.token == token:
    # NOTE(review): the `self.NextToken()` / `return True` / `return False`
    # lines appear to be missing here.

  def Consume(self, token):
    """Consumes a piece of text.

    Args:
      token: Text to consume.

    Raises:
      ParseError: If the text couldn't be consumed.
    """
    if not self.TryConsume(token):
      raise self._ParseError('Expected "%s".' % token)

  def ConsumeIdentifier(self):
    """Consumes protocol message field identifier.

    Returns:
      Identifier string.

    Raises:
      ParseError: If an identifier couldn't be consumed.
    """
    # NOTE(review): a `result = self.token` line appears to be missing here.
    if not self._IDENTIFIER.match(result):
      raise self._ParseError('Expected identifier.')
    # NOTE(review): `self.NextToken()` / `return result` appear missing.

  def ConsumeInt32(self):
    """Consumes a signed 32bit integer number.

    Returns:
      The integer parsed.

    Raises:
      ParseError: If a signed 32bit integer couldn't be consumed.
    """
    # NOTE(review): a `try:` line appears to be missing here, as do the
    # trailing `self.NextToken()` / `return result` lines.
    result = ParseInteger(self.token, is_signed=True, is_long=False)
    except ValueError, e:
      raise self._ParseError(str(e))

  def ConsumeUint32(self):
    """Consumes an unsigned 32bit integer number.

    Returns:
      The integer parsed.

    Raises:
      ParseError: If an unsigned 32bit integer couldn't be consumed.
    """
    # NOTE(review): same missing try/NextToken/return pattern as above.
    result = ParseInteger(self.token, is_signed=False, is_long=False)
    except ValueError, e:
      raise self._ParseError(str(e))

  def ConsumeInt64(self):
    """Consumes a signed 64bit integer number.

    Returns:
      The integer parsed.

    Raises:
      ParseError: If a signed 64bit integer couldn't be consumed.
    """
    # NOTE(review): same missing try/NextToken/return pattern as above.
    result = ParseInteger(self.token, is_signed=True, is_long=True)
    except ValueError, e:
      raise self._ParseError(str(e))

  def ConsumeUint64(self):
    """Consumes an unsigned 64bit integer number.

    Returns:
      The integer parsed.

    Raises:
      ParseError: If an unsigned 64bit integer couldn't be consumed.
    """
    # NOTE(review): same missing try/NextToken/return pattern as above.
    result = ParseInteger(self.token, is_signed=False, is_long=True)
    except ValueError, e:
      raise self._ParseError(str(e))

  def ConsumeFloat(self):
    """Consumes a floating point number.

    Returns:
      The number parsed.

    Raises:
      ParseError: If a floating point number couldn't be consumed.
    """
    # NOTE(review): same missing try/NextToken/return pattern as above.
    result = ParseFloat(self.token)
    except ValueError, e:
      raise self._ParseError(str(e))

  def ConsumeBool(self):
    """Consumes a boolean value.

    Returns:
      The boolean parsed.

    Raises:
      ParseError: If a boolean value couldn't be consumed.
    """
    # NOTE(review): same missing try/NextToken/return pattern as above.
    result = ParseBool(self.token)
    except ValueError, e:
      raise self._ParseError(str(e))

  def ConsumeString(self):
    """Consumes a string value.

    Returns:
      The string parsed (as a unicode object).

    Raises:
      ParseError: If a string value couldn't be consumed.
    """
    bytes = self.ConsumeByteString()
    # NOTE(review): a `try:` line appears to be missing here.
      return unicode(bytes, 'utf-8')
    except UnicodeDecodeError, e:
      raise self._StringParseError(e)

  def ConsumeByteString(self):
    """Consumes a byte array value.

    Returns:
      The array parsed (as a string).

    Raises:
      ParseError: If a byte array value couldn't be consumed.
    """
    # Adjacent string literals are concatenated, like in C.
    list = [self._ConsumeSingleByteString()]
    while len(self.token) > 0 and self.token[0] in ('\'', '"'):
      list.append(self._ConsumeSingleByteString())
    # NOTE(review): a `return ''.join(list)` line appears to be missing.

  def _ConsumeSingleByteString(self):
    """Consume one token of a string literal.

    String literals (whether bytes or text) can come in multiple adjacent
    tokens which are automatically concatenated, like in C or Python.  This
    method only consumes one token.
    """
    # NOTE(review): a `text = self.token` line appears to be missing here.
    if len(text) < 1 or text[0] not in ('\'', '"'):
      raise self._ParseError('Expected string.')
    if len(text) < 2 or text[-1] != text[0]:
      raise self._ParseError('String missing ending quote.')
    # NOTE(review): a `try:` line appears to be missing here, as do the
    # trailing `self.NextToken()` / `return result` lines.
      result = _CUnescape(text[1:-1])
    except ValueError, e:
      raise self._ParseError(str(e))

  def ConsumeEnum(self, field):
    """Consumes an enum value (name or number) for `field` via ParseEnum."""
    # NOTE(review): same missing try/NextToken/return pattern as above.
    result = ParseEnum(field, self.token)
    except ValueError, e:
      raise self._ParseError(str(e))

  def ParseErrorPreviousToken(self, message):
    """Creates and *returns* a ParseError for the previously read token.

    Args:
      message: A message to set for the exception.

    Returns:
      A ParseError instance.
    """
    # Positions are tracked 0-based internally; report 1-based line:column.
    return ParseError('%d:%d : %s' % (
        self._previous_line + 1, self._previous_column + 1, message))

  def _ParseError(self, message):
    """Creates and *returns* a ParseError for the current token."""
    return ParseError('%d:%d : %s' % (
        self._line + 1, self._column + 1, message))

  def _StringParseError(self, e):
    # Wraps a UnicodeDecodeError in a positioned ParseError.
    return self._ParseError('Couldn\'t parse string: ' + str(e))

  # NOTE(review): a `def NextToken(self):` line appears to be missing above
  # this docstring.
    """Reads the next meaningful token."""
    self._previous_line = self._line
    self._previous_column = self._column
    self._column += len(self.token)
    self._SkipWhitespace()
    if not self._lines and len(self._current_line) <= self._column:
      # NOTE(review): `self.token = ''` and `return` appear to be missing
      # in this end-of-input branch.
    match = self._TOKEN.match(self._current_line, self._column)
    # NOTE(review): an `if match:` guard (setting `self.token = token`) and
    # an `else:` appear to be missing around the two lines below.
      token = match.group(0)
      self.token = self._current_line[self._column]
589 # text.encode('string_escape') does not seem to satisfy our needs as it
590 # encodes unprintable characters using two-digit hex escapes whereas our
591 # C++ unescaping function allows hex escapes to be any length. So,
592 # "\0011".encode('string_escape') ends up being "\\x011", which will be
593 # decoded in C++ as a single-character string with char code 0x11.
594 def _CEscape(text, as_utf8):
597 if o == 10: return r"\n" # optional escape
598 if o == 13: return r"\r" # optional escape
599 if o == 9: return r"\t" # optional escape
600 if o == 39: return r"\'" # optional escape
602 if o == 34: return r'\"' # necessary escape
603 if o == 92: return r"\\" # necessary escape
606 if not as_utf8 and (o >= 127 or o < 32): return "\\%03o" % o
608 return "".join([escape(c) for c in text])
# Matches a C-style hex escape of one or two hex digits (e.g. '\x1f', '\xf').
_CUNESCAPE_HEX = re.compile('\\\\x([0-9a-fA-F]{2}|[0-9a-fA-F])')


def _CUnescape(text):
  """Reverses _CEscape: decodes C-style escape sequences in `text`."""
  def ReplaceHex(m):
    # Convert the matched '\xN'/'\xNN' escape to its literal character.
    return chr(int(m.group(0)[2:], 16))
  # This is required because the 'string_escape' encoding doesn't
  # allow single-digit hex escapes (like '\xf').
  result = _CUNESCAPE_HEX.sub(ReplaceHex, text)
  return result.decode('string_escape')
def ParseInteger(text, is_signed=False, is_long=False):
  """Parses an integer.

  Args:
    text: The text to parse.
    is_signed: True if a signed integer must be parsed.
    is_long: True if a long integer must be parsed.

  Returns:
    The integer value.

  Raises:
    ValueError: Thrown Iff the text is not a valid integer.
  """
  # Do the actual parsing. Exception handling is propagated to caller.
  try:
    # Base 0 lets int() accept decimal, hex ('0x...') and octal spellings.
    result = int(text, 0)
  except ValueError:
    raise ValueError('Couldn\'t parse integer: %s' % text)

  # Check if the integer is sane. Exceptions handled by callers.
  checker = _INTEGER_CHECKERS[2 * int(is_long) + int(is_signed)]
  checker.CheckValue(result)
  return result
def ParseFloat(text):
  """Parse a floating point number.

  Args:
    text: Text to parse.

  Returns:
    The number parsed.

  Raises:
    ValueError: If a floating point number couldn't be parsed.
  """
  try:
    # Assume Python compatible syntax.
    return float(text)
  except ValueError:
    # Check alternative spellings.
    if _FLOAT_INFINITY.match(text):
      if text[0] == '-':
        return float('-inf')
      else:
        return float('inf')
    elif _FLOAT_NAN.match(text):
      return float('nan')
    else:
      # assume '1.0f' format
      try:
        return float(text.rstrip('f'))
      except ValueError:
        raise ValueError('Couldn\'t parse float: %s' % text)
def ParseBool(text):
  """Parse a boolean value.

  Args:
    text: Text to parse.

  Returns:
    Boolean values parsed

  Raises:
    ValueError: If text is not a valid boolean.
  """
  if text in ('true', 't', '1'):
    return True
  elif text in ('false', 'f', '0'):
    return False
  else:
    raise ValueError('Expected "true" or "false".')
def ParseEnum(field, value):
  """Parse an enum value.

  The value can be specified by a number (the enum value), or by
  a string literal (the enum name).

  Args:
    field: Enum field descriptor.
    value: String value.

  Returns:
    Enum value number.

  Raises:
    ValueError: If the enum value could not be parsed.
  """
  enum_descriptor = field.enum_type
  try:
    number = int(value, 0)
  except ValueError:
    # Identifier: look the name up.
    enum_value = enum_descriptor.values_by_name.get(value, None)
    if enum_value is None:
      raise ValueError(
          'Enum type "%s" has no value named %s.' % (
              enum_descriptor.full_name, value))
  else:
    # Numeric value: look the number up.
    enum_value = enum_descriptor.values_by_number.get(number, None)
    if enum_value is None:
      raise ValueError(
          'Enum type "%s" has no value with number %d.' % (
              enum_descriptor.full_name, number))
  return enum_value.number