1 # Protocol Buffers - Google's data interchange format
2 # Copyright 2008 Google Inc. All rights reserved.
3 # http://code.google.com/p/protobuf/
5 # Redistribution and use in source and binary forms, with or without
6 # modification, are permitted provided that the following conditions are
9 # * Redistributions of source code must retain the above copyright
10 # notice, this list of conditions and the following disclaimer.
11 # * Redistributions in binary form must reproduce the above
12 # copyright notice, this list of conditions and the following disclaimer
13 # in the documentation and/or other materials provided with the
15 # * Neither the name of Google Inc. nor the names of its
16 # contributors may be used to endorse or promote products derived from
17 # this software without specific prior written permission.
19 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""Code for decoding protocol buffer primitives.

This code is very similar to encoder.py -- read the docs for that module first.

A "decoder" is a function with the signature:
  Decode(buffer, pos, end, message, field_dict)
The arguments are:
  buffer:     The string containing the encoded message.
  pos:        The current position in the string.
  end:        The position in the string where the current message ends.  May be
              less than len(buffer) if we're reading a sub-message.
  message:    The message object into which we're parsing.
  field_dict: message._fields (avoids a hashtable lookup).
The decoder reads the field and stores it into field_dict, returning the new
buffer position.  A decoder for a repeated field may proactively decode all of
the elements of that field, if they appear consecutively.

Note that decoders may throw any of the following:
  IndexError:  Indicates a truncated message.
  struct.error:  Unpacking of a fixed-width field failed.
  message.DecodeError:  Other errors.

Decoders are expected to raise an exception if they are called with pos > end.
This allows callers to be lax about bounds checking:  it's fine to read past
"end" as long as you are sure that someone else will notice and throw an
exception later on.

Something up the call stack is expected to catch IndexError and struct.error
and convert them to message.DecodeError.

Decoders are constructed using decoder constructors with the signature:
  MakeDecoder(field_number, is_repeated, is_packed, key, new_default)
The arguments are:
  field_number:  The field number of the field we want to decode.
  is_repeated:   Is the field a repeated field? (bool)
  is_packed:     Is the field a packed field? (bool)
  key:           The key to use when looking up the field within field_dict.
                 (This is actually the FieldDescriptor but nothing in this
                 file should depend on that.)
  new_default:   A function which takes a message object as a parameter and
                 returns a new instance of the default value for this field.
                 (This is called for repeated fields and sub-messages, when an
                 instance does not already exist.)

As with encoders, we define a decoder constructor for every type of field.
Then, for every field of every message class we construct an actual decoder.
That decoder goes into a dict indexed by tag, so when we decode a message
we repeatedly read a tag, look up the corresponding decoder, and invoke it.
"""
# Module authorship metadata.
__author__ = 'kenton@google.com (Kenton Varda)'
import struct

from google.protobuf import message
from google.protobuf.internal import encoder
from google.protobuf.internal import wire_format
# This will overflow and thus become IEEE-754 "infinity".  We would use
# "float('inf')" but it doesn't work on Windows pre-Python-2.6.
_POS_INF = 1e10000
_NEG_INF = -_POS_INF
# Infinity times zero is not-a-number under IEEE-754 arithmetic.
_NAN = _POS_INF * 0
# This is not for optimization, but rather to avoid conflicts with local
# variables named "message" (every decoder takes a parameter of that name,
# which would shadow the imported module inside the function body).
_DecodeError = message.DecodeError
def _VarintDecoder(mask):
  """Return a decoder for a basic varint value (does not include tag).

  Decoded values will be bitwise-anded with the given mask before being
  returned, e.g. to limit them to 32 bits.  The returned decoder does not
  take the usual "end" parameter -- the caller is expected to do bounds checking
  after the fact (often the caller can defer such checking until later).  The
  decoder returns a (value, new_pos) pair.

  Args:
    mask: Integer bit mask applied to the fully-assembled value.

  Returns:
    A function DecodeVarint(buffer, pos) -> (value, new_pos).  It raises
    IndexError if the buffer ends in the middle of the varint, and
    _DecodeError if the tenth byte still has its continuation bit set.
  """

  # Bound once so the inner loop uses a closure variable, not a builtin lookup.
  local_ord = ord

  def DecodeVarint(buffer, pos):
    result = 0
    shift = 0
    while 1:
      b = local_ord(buffer[pos])
      # The low seven bits are payload, least-significant group first.
      result |= ((b & 0x7f) << shift)
      pos += 1
      if not (b & 0x80):
        # Continuation bit clear: this was the last byte.
        result &= mask
        return (result, pos)
      shift += 7
      if shift >= 64:
        raise _DecodeError('Too many bytes when decoding varint.')

  return DecodeVarint
def _SignedVarintDecoder(mask):
  """Like _VarintDecoder() but decodes signed values.

  Args:
    mask: Integer bit mask selecting the value bits to keep, e.g.
      (1 << 32) - 1 for 32-bit fields.

  Returns:
    A function DecodeVarint(buffer, pos) -> (value, new_pos).  Values whose
    64-bit two's-complement sign bit is set come back negative.  It raises
    IndexError on a truncated buffer and _DecodeError if the tenth byte still
    has its continuation bit set.
  """

  # Bound once so the inner loop uses a closure variable, not a builtin lookup.
  local_ord = ord

  def DecodeVarint(buffer, pos):
    result = 0
    shift = 0
    while 1:
      b = local_ord(buffer[pos])
      result |= ((b & 0x7f) << shift)
      pos += 1
      if not (b & 0x80):
        if result > 0x7fffffffffffffff:
          # Bit 63 is set: reinterpret as a negative 64-bit value, then keep
          # every bit above the mask set so the result stays sign-extended.
          result -= (1 << 64)
          result |= ~mask
        else:
          result &= mask
        return (result, pos)
      shift += 7
      if shift >= 64:
        raise _DecodeError('Too many bytes when decoding varint.')

  return DecodeVarint
# Decoders for full-width (64-bit) varints, unsigned and signed.
_DecodeVarint = _VarintDecoder((1 << 64) - 1)
_DecodeSignedVarint = _SignedVarintDecoder((1 << 64) - 1)

# Use these versions for values which must be limited to 32 bits.
_DecodeVarint32 = _VarintDecoder((1 << 32) - 1)
_DecodeSignedVarint32 = _SignedVarintDecoder((1 << 32) - 1)
def ReadTag(buffer, pos):
  """Read a tag from the buffer, and return a (tag_bytes, new_pos) tuple.

  We return the raw bytes of the tag rather than decoding them.  The raw
  bytes can then be used to look up the proper decoder.  This effectively allows
  us to trade some work that would be done in pure-python (decoding a varint)
  for work that is done in C (searching for a byte string in a hash table).
  In a low-level language it would be much cheaper to decode the varint and
  use that, but not in Python.

  Args:
    buffer: The string containing the encoded message.
    pos: Index of the first byte of the tag.

  Returns:
    (tag_bytes, new_pos): the slice holding the tag, and the index of the
    byte that follows it.

  Raises:
    IndexError: The buffer ended before the tag's final byte.
  """

  start = pos
  # Every byte except the last one of a varint has its high bit set.
  while ord(buffer[pos]) & 0x80:
    pos += 1
  pos += 1  # Step past the final byte as well.
  return (buffer[start:pos], pos)
178 # --------------------------------------------------------------------
def _SimpleDecoder(wire_type, decode_value):
  """Return a constructor for a decoder for fields of a particular type.

  Args:
      wire_type:  The field's wire type.
      decode_value:  A function which decodes an individual value, e.g.
        _DecodeVarint().  It is called as decode_value(buffer, pos) and must
        return a (value, new_pos) pair.

  Returns:
      A decoder constructor with the standard signature
      (field_number, is_repeated, is_packed, key, new_default).
  """

  def SpecificDecoder(field_number, is_repeated, is_packed, key, new_default):
    if is_packed:
      local_DecodeVarint = _DecodeVarint

      def DecodePackedField(buffer, pos, end, message, field_dict):
        value = field_dict.get(key)
        if value is None:
          value = field_dict.setdefault(key, new_default(message))
        # A packed field is a byte length followed by back-to-back elements.
        (endpoint, pos) = local_DecodeVarint(buffer, pos)
        endpoint += pos
        if endpoint > end:
          raise _DecodeError('Truncated message.')
        while pos < endpoint:
          (element, pos) = decode_value(buffer, pos)
          value.append(element)
        if pos > endpoint:
          # The last element ran past the declared length.
          del value[-1]   # Discard corrupt value.
          raise _DecodeError('Packed element was truncated.')
        return pos

      return DecodePackedField
    elif is_repeated:
      tag_bytes = encoder.TagBytes(field_number, wire_type)
      tag_len = len(tag_bytes)

      def DecodeRepeatedField(buffer, pos, end, message, field_dict):
        value = field_dict.get(key)
        if value is None:
          value = field_dict.setdefault(key, new_default(message))
        while 1:
          (element, new_pos) = decode_value(buffer, pos)
          value.append(element)
          # Predict that the next tag is another copy of the same repeated
          # field.
          pos = new_pos + tag_len
          if buffer[new_pos:pos] != tag_bytes or new_pos >= end:
            # Prediction failed.  Return.
            if new_pos > end:
              raise _DecodeError('Truncated message.')
            return new_pos

      return DecodeRepeatedField
    else:
      def DecodeField(buffer, pos, end, message, field_dict):
        (field_dict[key], pos) = decode_value(buffer, pos)
        if pos > end:
          del field_dict[key]  # Discard corrupt value.
          raise _DecodeError('Truncated message.')
        return pos

      return DecodeField

  return SpecificDecoder
def _ModifiedDecoder(wire_type, decode_value, modify_value):
  """Like SimpleDecoder but additionally invokes modify_value on every value
  before storing it.  Usually modify_value is ZigZagDecode.

  Args:
      wire_type:  The field's wire type.
      decode_value:  Function (buffer, pos) -> (raw_value, new_pos).
      modify_value:  Function applied to raw_value; its result is what gets
        stored.

  Returns:
      A decoder constructor, as produced by _SimpleDecoder().
  """

  # Reusing _SimpleDecoder is slightly slower than copying a bunch of code, but
  # not enough to make a significant difference.

  def InnerDecode(buffer, pos):
    (result, new_pos) = decode_value(buffer, pos)
    return (modify_value(result), new_pos)

  return _SimpleDecoder(wire_type, InnerDecode)
def _StructPackDecoder(wire_type, format):
  """Return a constructor for a decoder for a fixed-width field.

  Args:
      wire_type:  The field's wire type.
      format:  The format string to pass to struct.unpack().

  Returns:
      A decoder constructor, as produced by _SimpleDecoder().
  """

  # Computed once per format; every value of this type has the same width.
  value_size = struct.calcsize(format)
  local_unpack = struct.unpack

  # Reusing _SimpleDecoder is slightly slower than copying a bunch of code, but
  # not enough to make a significant difference.

  # Note that we expect someone up-stack to catch struct.error and convert
  # it to _DecodeError -- this way we don't have to set up exception-
  # handling blocks every time we parse one value.

  def InnerDecode(buffer, pos):
    new_pos = pos + value_size
    result = local_unpack(format, buffer[pos:new_pos])[0]
    return (result, new_pos)

  return _SimpleDecoder(wire_type, InnerDecode)
def _FloatDecoder():
  """Returns a decoder for a float field.

  This code works around a bug in struct.unpack for non-finite 32-bit
  floating-point values.

  Returns:
      A decoder constructor, as produced by _SimpleDecoder(), for
      WIRETYPE_FIXED32 values decoded as little-endian 32-bit floats.
  """

  local_unpack = struct.unpack

  def InnerDecode(buffer, pos):
    # We expect a 32-bit value in little-endian byte order.  Bit 1 is the sign
    # bit, bits 2-9 represent the exponent, and bits 10-32 are the significand.
    new_pos = pos + 4
    float_bytes = buffer[pos:new_pos]

    # If this value has all its exponent bits set, then it's non-finite.
    # In Python 2.4, struct.unpack will convert it to a finite 64-bit value.
    # To avoid that, we parse it specially.
    if ((float_bytes[3] in '\x7F\xFF')
        and (float_bytes[2] >= '\x80')):
      # If at least one significand bit is set...
      if float_bytes[0:3] != '\x00\x00\x80':
        return (_NAN, new_pos)
      # If sign bit is set...
      if float_bytes[3] == '\xFF':
        return (_NEG_INF, new_pos)
      return (_POS_INF, new_pos)

    # Note that we expect someone up-stack to catch struct.error and convert
    # it to _DecodeError -- this way we don't have to set up exception-
    # handling blocks every time we parse one value.
    result = local_unpack('<f', float_bytes)[0]
    return (result, new_pos)

  return _SimpleDecoder(wire_format.WIRETYPE_FIXED32, InnerDecode)
def _DoubleDecoder():
  """Returns a decoder for a double field.

  This code works around a bug in struct.unpack for not-a-number.

  Returns:
      A decoder constructor, as produced by _SimpleDecoder(), for
      WIRETYPE_FIXED64 values decoded as little-endian 64-bit floats.
  """

  local_unpack = struct.unpack

  def InnerDecode(buffer, pos):
    # We expect a 64-bit value in little-endian byte order.  Bit 1 is the sign
    # bit, bits 2-12 represent the exponent, and bits 13-64 are the significand.
    new_pos = pos + 8
    double_bytes = buffer[pos:new_pos]

    # If this value has all its exponent bits set and at least one significand
    # bit set, it's not a number.  In Python 2.4, struct.unpack will treat it
    # as inf or -inf.  To avoid that, we treat it specially.
    if ((double_bytes[7] in '\x7F\xFF')
        and (double_bytes[6] >= '\xF0')
        and (double_bytes[0:7] != '\x00\x00\x00\x00\x00\x00\xF0')):
      return (_NAN, new_pos)

    # Note that we expect someone up-stack to catch struct.error and convert
    # it to _DecodeError -- this way we don't have to set up exception-
    # handling blocks every time we parse one value.
    result = local_unpack('<d', double_bytes)[0]
    return (result, new_pos)

  return _SimpleDecoder(wire_format.WIRETYPE_FIXED64, InnerDecode)
345 # --------------------------------------------------------------------
# Decoder constructors for the varint-encoded scalar types.  Enums share the
# signed 32-bit decoder.
Int32Decoder = EnumDecoder = _SimpleDecoder(
    wire_format.WIRETYPE_VARINT, _DecodeSignedVarint32)

Int64Decoder = _SimpleDecoder(
    wire_format.WIRETYPE_VARINT, _DecodeSignedVarint)

UInt32Decoder = _SimpleDecoder(wire_format.WIRETYPE_VARINT, _DecodeVarint32)
UInt64Decoder = _SimpleDecoder(wire_format.WIRETYPE_VARINT, _DecodeVarint)

# sint32/sint64 are read as unsigned varints and then zig-zag decoded.
SInt32Decoder = _ModifiedDecoder(
    wire_format.WIRETYPE_VARINT, _DecodeVarint32, wire_format.ZigZagDecode)
SInt64Decoder = _ModifiedDecoder(
    wire_format.WIRETYPE_VARINT, _DecodeVarint, wire_format.ZigZagDecode)

# Note that Python conveniently guarantees that when using the '<' prefix on
# formats, they will also have the same size across all platforms (as opposed
# to without the prefix, where their sizes depend on the C compiler's basic
# type sizes).
Fixed32Decoder = _StructPackDecoder(wire_format.WIRETYPE_FIXED32, '<I')
Fixed64Decoder = _StructPackDecoder(wire_format.WIRETYPE_FIXED64, '<Q')
SFixed32Decoder = _StructPackDecoder(wire_format.WIRETYPE_FIXED32, '<i')
SFixed64Decoder = _StructPackDecoder(wire_format.WIRETYPE_FIXED64, '<q')
FloatDecoder = _FloatDecoder()
DoubleDecoder = _DoubleDecoder()

# Booleans are varints passed through bool().
BoolDecoder = _ModifiedDecoder(
    wire_format.WIRETYPE_VARINT, _DecodeVarint, bool)
def StringDecoder(field_number, is_repeated, is_packed, key, new_default):
  """Returns a decoder for a string field.

  The payload is a varint byte length followed by that many bytes, which are
  decoded as UTF-8 into a unicode object.

  Args:
    field_number: The field number of the field we want to decode.
    is_repeated: Whether the field is repeated.
    is_packed: Must be false; strings cannot be packed.
    key: The key under which the value is stored in field_dict.
    new_default: Function (message) -> new container; used for repeated
      fields when no container exists yet.

  Returns:
    A decoder function (buffer, pos, end, message, field_dict) -> new_pos.
    The decoder raises _DecodeError if the declared length runs past `end`.
  """

  local_DecodeVarint = _DecodeVarint
  local_unicode = unicode

  assert not is_packed
  if is_repeated:
    tag_bytes = encoder.TagBytes(field_number,
                                 wire_format.WIRETYPE_LENGTH_DELIMITED)
    tag_len = len(tag_bytes)

    def DecodeRepeatedField(buffer, pos, end, message, field_dict):
      value = field_dict.get(key)
      if value is None:
        value = field_dict.setdefault(key, new_default(message))
      while 1:
        (size, pos) = local_DecodeVarint(buffer, pos)
        new_pos = pos + size
        if new_pos > end:
          raise _DecodeError('Truncated string.')
        value.append(local_unicode(buffer[pos:new_pos], 'utf-8'))
        # Predict that the next tag is another copy of the same repeated field.
        pos = new_pos + tag_len
        if buffer[new_pos:pos] != tag_bytes or new_pos == end:
          # Prediction failed.  Return.
          return new_pos

    return DecodeRepeatedField
  else:
    def DecodeField(buffer, pos, end, message, field_dict):
      (size, pos) = local_DecodeVarint(buffer, pos)
      new_pos = pos + size
      if new_pos > end:
        raise _DecodeError('Truncated string.')
      field_dict[key] = local_unicode(buffer[pos:new_pos], 'utf-8')
      return new_pos

    return DecodeField
def BytesDecoder(field_number, is_repeated, is_packed, key, new_default):
  """Returns a decoder for a bytes field.

  The payload is a varint byte length followed by that many bytes, which are
  stored as a raw slice of the buffer.

  Args:
    field_number: The field number of the field we want to decode.
    is_repeated: Whether the field is repeated.
    is_packed: Must be false; bytes fields cannot be packed.
    key: The key under which the value is stored in field_dict.
    new_default: Function (message) -> new container; used for repeated
      fields when no container exists yet.

  Returns:
    A decoder function (buffer, pos, end, message, field_dict) -> new_pos.
    The decoder raises _DecodeError if the declared length runs past `end`.
  """

  local_DecodeVarint = _DecodeVarint

  assert not is_packed
  if is_repeated:
    tag_bytes = encoder.TagBytes(field_number,
                                 wire_format.WIRETYPE_LENGTH_DELIMITED)
    tag_len = len(tag_bytes)

    def DecodeRepeatedField(buffer, pos, end, message, field_dict):
      value = field_dict.get(key)
      if value is None:
        value = field_dict.setdefault(key, new_default(message))
      while 1:
        (size, pos) = local_DecodeVarint(buffer, pos)
        new_pos = pos + size
        if new_pos > end:
          raise _DecodeError('Truncated string.')
        value.append(buffer[pos:new_pos])
        # Predict that the next tag is another copy of the same repeated field.
        pos = new_pos + tag_len
        if buffer[new_pos:pos] != tag_bytes or new_pos == end:
          # Prediction failed.  Return.
          return new_pos

    return DecodeRepeatedField
  else:
    def DecodeField(buffer, pos, end, message, field_dict):
      (size, pos) = local_DecodeVarint(buffer, pos)
      new_pos = pos + size
      if new_pos > end:
        raise _DecodeError('Truncated string.')
      field_dict[key] = buffer[pos:new_pos]
      return new_pos

    return DecodeField
def GroupDecoder(field_number, is_repeated, is_packed, key, new_default):
  """Returns a decoder for a group field.

  A group is a sub-message terminated by an END_GROUP tag carrying the same
  field number rather than prefixed with a length.

  Args:
    field_number: The field number of the field we want to decode.
    is_repeated: Whether the field is repeated.
    is_packed: Must be false; groups cannot be packed.
    key: The key under which the value is stored in field_dict.
    new_default: Function (message) -> new sub-message or container; called
      when field_dict has no entry for `key` yet.

  Returns:
    A decoder function (buffer, pos, end, message, field_dict) -> new_pos.
    The decoder raises _DecodeError if the matching end tag is missing.
  """

  end_tag_bytes = encoder.TagBytes(field_number,
                                   wire_format.WIRETYPE_END_GROUP)
  end_tag_len = len(end_tag_bytes)

  assert not is_packed
  if is_repeated:
    tag_bytes = encoder.TagBytes(field_number,
                                 wire_format.WIRETYPE_START_GROUP)
    tag_len = len(tag_bytes)

    def DecodeRepeatedField(buffer, pos, end, message, field_dict):
      # Look the container up once; it does not change between elements.
      value = field_dict.get(key)
      if value is None:
        value = field_dict.setdefault(key, new_default(message))
      while 1:
        # Read sub-message.
        pos = value.add()._InternalParse(buffer, pos, end)
        # Read end tag.
        new_pos = pos+end_tag_len
        if buffer[pos:new_pos] != end_tag_bytes or new_pos > end:
          raise _DecodeError('Missing group end tag.')
        # Predict that the next tag is another copy of the same repeated field.
        pos = new_pos + tag_len
        if buffer[new_pos:pos] != tag_bytes or new_pos == end:
          # Prediction failed.  Return.
          return new_pos

    return DecodeRepeatedField
  else:
    def DecodeField(buffer, pos, end, message, field_dict):
      value = field_dict.get(key)
      if value is None:
        value = field_dict.setdefault(key, new_default(message))
      # Read sub-message.
      pos = value._InternalParse(buffer, pos, end)
      # Read end tag.
      new_pos = pos+end_tag_len
      if buffer[pos:new_pos] != end_tag_bytes or new_pos > end:
        raise _DecodeError('Missing group end tag.')
      return new_pos

    return DecodeField
def MessageDecoder(field_number, is_repeated, is_packed, key, new_default):
  """Returns a decoder for a message field.

  The payload is a varint byte length followed by the encoded sub-message.

  Args:
    field_number: The field number of the field we want to decode.
    is_repeated: Whether the field is repeated.
    is_packed: Must be false; messages cannot be packed.
    key: The key under which the value is stored in field_dict.
    new_default: Function (message) -> new sub-message or container; called
      when field_dict has no entry for `key` yet.

  Returns:
    A decoder function (buffer, pos, end, message, field_dict) -> new_pos.
    The decoder raises _DecodeError if the declared length runs past `end`
    or the sub-message parse stops before the declared end.
  """

  local_DecodeVarint = _DecodeVarint

  assert not is_packed
  if is_repeated:
    tag_bytes = encoder.TagBytes(field_number,
                                 wire_format.WIRETYPE_LENGTH_DELIMITED)
    tag_len = len(tag_bytes)

    def DecodeRepeatedField(buffer, pos, end, message, field_dict):
      # Look the container up once; it does not change between elements.
      value = field_dict.get(key)
      if value is None:
        value = field_dict.setdefault(key, new_default(message))
      while 1:
        # Read length.
        (size, pos) = local_DecodeVarint(buffer, pos)
        new_pos = pos + size
        if new_pos > end:
          raise _DecodeError('Truncated message.')
        # Read sub-message.
        if value.add()._InternalParse(buffer, pos, new_pos) != new_pos:
          # The only reason _InternalParse would return early is if it
          # encountered an end-group tag.
          raise _DecodeError('Unexpected end-group tag.')
        # Predict that the next tag is another copy of the same repeated field.
        pos = new_pos + tag_len
        if buffer[new_pos:pos] != tag_bytes or new_pos == end:
          # Prediction failed.  Return.
          return new_pos

    return DecodeRepeatedField
  else:
    def DecodeField(buffer, pos, end, message, field_dict):
      value = field_dict.get(key)
      if value is None:
        value = field_dict.setdefault(key, new_default(message))
      # Read length.
      (size, pos) = local_DecodeVarint(buffer, pos)
      new_pos = pos + size
      if new_pos > end:
        raise _DecodeError('Truncated message.')
      # Read sub-message.
      if value._InternalParse(buffer, pos, new_pos) != new_pos:
        # The only reason _InternalParse would return early is if it encountered
        # an end-group tag.
        raise _DecodeError('Unexpected end-group tag.')
      return new_pos

    return DecodeField
552 # --------------------------------------------------------------------
# Raw tag bytes that open a MessageSet item: field 1, START_GROUP wire type.
MESSAGE_SET_ITEM_TAG = encoder.TagBytes(1, wire_format.WIRETYPE_START_GROUP)
def MessageSetItemDecoder(extensions_by_number):
  """Returns a decoder for a MessageSet item.

  The parameter is the _extensions_by_number map for the message class.

  The message set message looks like this:
    message MessageSet {
      repeated group Item = 1 {
        required int32 type_id = 2;
        required string message = 3;
      }
    }

  Returns:
    A decoder function (buffer, pos, end, message, field_dict) -> new_pos.
    Items whose type_id is found in extensions_by_number are parsed into the
    matching extension; all others are appended, still encoded, to
    message._unknown_fields.  The decoder raises _DecodeError for an item
    that is truncated, has no type_id, has no message, or whose payload
    contains an unexpected end-group tag.
  """

  type_id_tag_bytes = encoder.TagBytes(2, wire_format.WIRETYPE_VARINT)
  message_tag_bytes = encoder.TagBytes(3, wire_format.WIRETYPE_LENGTH_DELIMITED)
  item_end_tag_bytes = encoder.TagBytes(1, wire_format.WIRETYPE_END_GROUP)

  local_ReadTag = ReadTag
  local_DecodeVarint = _DecodeVarint
  local_SkipField = SkipField

  def DecodeItem(buffer, pos, end, message, field_dict):
    message_set_item_start = pos
    # -1 marks "not seen yet" for each of the required parts.
    type_id = -1
    message_start = -1
    message_end = -1

    # Technically, type_id and message can appear in any order, so we need
    # a little loop here.
    while 1:
      (tag_bytes, pos) = local_ReadTag(buffer, pos)
      if tag_bytes == type_id_tag_bytes:
        (type_id, pos) = local_DecodeVarint(buffer, pos)
      elif tag_bytes == message_tag_bytes:
        # Only record where the payload lives; it is parsed once type_id is
        # known.
        (size, message_start) = local_DecodeVarint(buffer, pos)
        pos = message_end = message_start + size
      elif tag_bytes == item_end_tag_bytes:
        break
      else:
        pos = local_SkipField(buffer, pos, end, tag_bytes)
        if pos == -1:
          # An end-group tag other than the item's own.
          raise _DecodeError('Missing group end tag.')

    if pos > end:
      raise _DecodeError('Truncated message.')

    if type_id == -1:
      raise _DecodeError('MessageSet item missing type_id.')
    if message_start == -1:
      raise _DecodeError('MessageSet item missing message.')

    extension = extensions_by_number.get(type_id)
    if extension is not None:
      value = field_dict.get(extension)
      if value is None:
        value = field_dict.setdefault(
            extension, extension.message_type._concrete_class())
      if value._InternalParse(buffer, message_start,message_end) != message_end:
        # The only reason _InternalParse would return early is if it encountered
        # an end-group tag.
        raise _DecodeError('Unexpected end-group tag.')
    else:
      # Unknown extension: keep the raw bytes of the whole item.
      if not message._unknown_fields:
        message._unknown_fields = []
      message._unknown_fields.append((MESSAGE_SET_ITEM_TAG,
                                      buffer[message_set_item_start:pos]))

    return pos

  return DecodeItem
628 # --------------------------------------------------------------------
629 # Optimization is not as heavy here because calls to SkipField() are rare,
630 # except for handling end-group tags.
def _SkipVarint(buffer, pos, end):
  """Skip a varint value.  Returns the new position.

  Raises IndexError if the buffer ends before the varint's final byte, and
  _DecodeError if the varint extends past `end`.
  """

  # Bytes with the high bit set are followed by more bytes of the same varint.
  while ord(buffer[pos]) & 0x80:
    pos += 1
  pos += 1  # Step past the final byte.
  if pos > end:
    raise _DecodeError('Truncated message.')
  return pos
def _SkipFixed64(buffer, pos, end):
  """Skip a fixed64 value.  Returns the new position.

  Raises _DecodeError if the eight bytes extend past `end`.
  """

  pos += 8
  if pos > end:
    raise _DecodeError('Truncated message.')
  return pos
def _SkipLengthDelimited(buffer, pos, end):
  """Skip a length-delimited value.  Returns the new position.

  Raises _DecodeError if the declared length extends past `end`.
  """

  # The value is a varint byte count followed by that many bytes.
  (size, pos) = _DecodeVarint(buffer, pos)
  pos += size
  if pos > end:
    raise _DecodeError('Truncated message.')
  return pos
def _SkipGroup(buffer, pos, end):
  """Skip sub-group.  Returns the new position.

  Fields are skipped one after another until SkipField() reports an
  end-group tag by returning -1; the position just after that tag is
  returned.
  """

  while 1:
    (tag_bytes, pos) = ReadTag(buffer, pos)
    new_pos = SkipField(buffer, pos, end, tag_bytes)
    if new_pos == -1:
      return pos
    pos = new_pos
def _EndGroup(buffer, pos, end):
  """Skipping an END_GROUP tag returns -1 to tell the parent loop to break."""

  return -1
def _SkipFixed32(buffer, pos, end):
  """Skip a fixed32 value.  Returns the new position.

  Raises _DecodeError if the four bytes extend past `end`.
  """

  pos += 4
  if pos > end:
    raise _DecodeError('Truncated message.')
  return pos
def _RaiseInvalidWireType(buffer, pos, end):
  """Skip function for unknown wire types.  Raises an exception."""

  raise _DecodeError('Tag had invalid wire type.')
def _FieldSkipper():
  """Constructs the SkipField function."""

  # Indexed by wire type (the masked low bits of the tag's first byte).
  WIRETYPE_TO_SKIPPER = [
      _SkipVarint,
      _SkipFixed64,
      _SkipLengthDelimited,
      _SkipGroup,
      _EndGroup,
      _SkipFixed32,
      _RaiseInvalidWireType,
      _RaiseInvalidWireType,
      ]

  wiretype_mask = wire_format.TAG_TYPE_MASK
  local_ord = ord

  def SkipField(buffer, pos, end, tag_bytes):
    """Skips a field with the specified tag.

    |pos| should point to the byte immediately after the tag.

    Returns:
        The new position (after the tag value), or -1 if the tag is an end-group
        tag (in which case the calling loop should break).
    """

    # The wire type is always in the first byte since varints are little-endian.
    wire_type = local_ord(tag_bytes[0]) & wiretype_mask
    return WIRETYPE_TO_SKIPPER[wire_type](buffer, pos, end)

  return SkipField
# Module-level skipper: SkipField(buffer, pos, end, tag_bytes) -> new_pos.
SkipField = _FieldSkipper()