1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc. All rights reserved.
3 // http://code.google.com/p/protobuf/
5 // Redistribution and use in source and binary forms, with or without
6 // modification, are permitted provided that the following conditions are
9 // * Redistributions of source code must retain the above copyright
10 // notice, this list of conditions and the following disclaimer.
11 // * Redistributions in binary form must reproduce the above
12 // copyright notice, this list of conditions and the following disclaimer
13 // in the documentation and/or other materials provided with the
15 // * Neither the name of Google Inc. nor the names of its
16 // contributors may be used to endorse or promote products derived from
17 // this software without specific prior written permission.
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 package com.google.protobuf;
33 import com.google.protobuf.Descriptors.Descriptor;
34 import com.google.protobuf.Descriptors.FieldDescriptor;
35 import com.google.protobuf.Descriptors.EnumDescriptor;
36 import com.google.protobuf.Descriptors.EnumValueDescriptor;
38 import java.io.IOException;
39 import java.nio.CharBuffer;
40 import java.math.BigInteger;
41 import java.util.ArrayList;
42 import java.util.List;
43 import java.util.Locale;
45 import java.util.regex.Matcher;
46 import java.util.regex.Pattern;
/**
 * Provide text parsing and formatting support for proto2 instances.
 * The implementation largely follows google/protobuf/text_format.cc.
 *
 * @author wenboz@google.com Wenbo Zhu
 * @author kenton@google.com Kenton Varda
 */
55 public final class TextFormat {
/** Private constructor; prevents instantiation. */
private TextFormat() {}

/** Printer with default settings (multi-line, non-ASCII escaped). */
private static final Printer DEFAULT_PRINTER = new Printer();

/** Printer configured with single-line mode switched on. */
private static final Printer SINGLE_LINE_PRINTER =
    (new Printer()).setSingleLineMode(true);

/** Printer configured with non-ASCII escaping switched off. */
private static final Printer UNICODE_PRINTER =
    (new Printer()).setEscapeNonAscii(false);
65 * Outputs a textual representation of the Protocol Message supplied into
66 * the parameter output. (This representation is the new version of the
67 * classic "ProtocolPrinter" output from the original Protocol Buffer system)
69 public static void print(final MessageOrBuilder message, final Appendable output)
71 DEFAULT_PRINTER.print(message, new TextGenerator(output));
74 /** Outputs a textual representation of {@code fields} to {@code output}. */
75 public static void print(final UnknownFieldSet fields,
76 final Appendable output)
78 DEFAULT_PRINTER.printUnknownFields(fields, new TextGenerator(output));
82 * Generates a human readable form of this message, useful for debugging and
83 * other purposes, with no newline characters.
85 public static String shortDebugString(final MessageOrBuilder message) {
87 final StringBuilder sb = new StringBuilder();
88 SINGLE_LINE_PRINTER.print(message, new TextGenerator(sb));
89 // Single line mode currently might have an extra space at the end.
90 return sb.toString().trim();
91 } catch (IOException e) {
92 throw new IllegalStateException(e);
97 * Generates a human readable form of the unknown fields, useful for debugging
98 * and other purposes, with no newline characters.
100 public static String shortDebugString(final UnknownFieldSet fields) {
102 final StringBuilder sb = new StringBuilder();
103 SINGLE_LINE_PRINTER.printUnknownFields(fields, new TextGenerator(sb));
104 // Single line mode currently might have an extra space at the end.
105 return sb.toString().trim();
106 } catch (IOException e) {
107 throw new IllegalStateException(e);
112 * Like {@code print()}, but writes directly to a {@code String} and
115 public static String printToString(final MessageOrBuilder message) {
117 final StringBuilder text = new StringBuilder();
118 print(message, text);
119 return text.toString();
120 } catch (IOException e) {
121 throw new IllegalStateException(e);
126 * Like {@code print()}, but writes directly to a {@code String} and
129 public static String printToString(final UnknownFieldSet fields) {
131 final StringBuilder text = new StringBuilder();
133 return text.toString();
134 } catch (IOException e) {
135 throw new IllegalStateException(e);
140 * Same as {@code printToString()}, except that non-ASCII characters
141 * in string type fields are not escaped in backslash+octals.
143 public static String printToUnicodeString(final MessageOrBuilder message) {
145 final StringBuilder text = new StringBuilder();
146 UNICODE_PRINTER.print(message, new TextGenerator(text));
147 return text.toString();
148 } catch (IOException e) {
149 throw new IllegalStateException(e);
154 * Same as {@code printToString()}, except that non-ASCII characters
155 * in string type fields are not escaped in backslash+octals.
157 public static String printToUnicodeString(final UnknownFieldSet fields) {
159 final StringBuilder text = new StringBuilder();
160 UNICODE_PRINTER.printUnknownFields(fields, new TextGenerator(text));
161 return text.toString();
162 } catch (IOException e) {
163 throw new IllegalStateException(e);
167 public static void printField(final FieldDescriptor field,
169 final Appendable output)
171 DEFAULT_PRINTER.printField(field, value, new TextGenerator(output));
174 public static String printFieldToString(final FieldDescriptor field,
175 final Object value) {
177 final StringBuilder text = new StringBuilder();
178 printField(field, value, text);
179 return text.toString();
180 } catch (IOException e) {
181 throw new IllegalStateException(e);
186 * Outputs a textual representation of the value of given field value.
188 * @param field the descriptor of the field
189 * @param value the value of the field
190 * @param output the output to which to append the formatted value
191 * @throws ClassCastException if the value is not appropriate for the
192 * given field descriptor
193 * @throws IOException if there is an exception writing to the output
195 public static void printFieldValue(final FieldDescriptor field,
197 final Appendable output)
199 DEFAULT_PRINTER.printFieldValue(field, value, new TextGenerator(output));
203 * Outputs a textual representation of the value of an unknown field.
205 * @param tag the field's tag number
206 * @param value the value of the field
207 * @param output the output to which to append the formatted value
208 * @throws ClassCastException if the value is not appropriate for the
209 * given field descriptor
210 * @throws IOException if there is an exception writing to the output
212 public static void printUnknownFieldValue(final int tag,
214 final Appendable output)
216 printUnknownFieldValue(tag, value, new TextGenerator(output));
219 private static void printUnknownFieldValue(final int tag,
221 final TextGenerator generator)
223 switch (WireFormat.getTagWireType(tag)) {
224 case WireFormat.WIRETYPE_VARINT:
225 generator.print(unsignedToString((Long) value));
227 case WireFormat.WIRETYPE_FIXED32:
229 String.format((Locale) null, "0x%08x", (Integer) value));
231 case WireFormat.WIRETYPE_FIXED64:
232 generator.print(String.format((Locale) null, "0x%016x", (Long) value));
234 case WireFormat.WIRETYPE_LENGTH_DELIMITED:
235 generator.print("\"");
236 generator.print(escapeBytes((ByteString) value));
237 generator.print("\"");
239 case WireFormat.WIRETYPE_START_GROUP:
240 DEFAULT_PRINTER.printUnknownFields((UnknownFieldSet) value, generator);
243 throw new IllegalArgumentException("Bad tag: " + tag);
247 /** Helper class for converting protobufs to text. */
248 private static final class Printer {
249 /** Whether to omit newlines from the output. */
250 boolean singleLineMode = false;
252 /** Whether to escape non ASCII characters with backslash and octal. */
253 boolean escapeNonAscii = true;
257 /** Setter of singleLineMode */
258 private Printer setSingleLineMode(boolean singleLineMode) {
259 this.singleLineMode = singleLineMode;
263 /** Setter of escapeNonAscii */
264 private Printer setEscapeNonAscii(boolean escapeNonAscii) {
265 this.escapeNonAscii = escapeNonAscii;
269 private void print(final MessageOrBuilder message, final TextGenerator generator)
271 for (Map.Entry<FieldDescriptor, Object> field
272 : message.getAllFields().entrySet()) {
273 printField(field.getKey(), field.getValue(), generator);
275 printUnknownFields(message.getUnknownFields(), generator);
278 private void printField(final FieldDescriptor field, final Object value,
279 final TextGenerator generator) throws IOException {
280 if (field.isRepeated()) {
281 // Repeated field. Print each element.
282 for (Object element : (List<?>) value) {
283 printSingleField(field, element, generator);
286 printSingleField(field, value, generator);
290 private void printSingleField(final FieldDescriptor field,
292 final TextGenerator generator)
294 if (field.isExtension()) {
295 generator.print("[");
296 // We special-case MessageSet elements for compatibility with proto1.
297 if (field.getContainingType().getOptions().getMessageSetWireFormat()
298 && (field.getType() == FieldDescriptor.Type.MESSAGE)
299 && (field.isOptional())
301 && (field.getExtensionScope() == field.getMessageType())) {
302 generator.print(field.getMessageType().getFullName());
304 generator.print(field.getFullName());
306 generator.print("]");
308 if (field.getType() == FieldDescriptor.Type.GROUP) {
309 // Groups must be serialized with their original capitalization.
310 generator.print(field.getMessageType().getName());
312 generator.print(field.getName());
316 if (field.getJavaType() == FieldDescriptor.JavaType.MESSAGE) {
317 if (singleLineMode) {
318 generator.print(" { ");
320 generator.print(" {\n");
324 generator.print(": ");
327 printFieldValue(field, value, generator);
329 if (field.getJavaType() == FieldDescriptor.JavaType.MESSAGE) {
330 if (singleLineMode) {
331 generator.print("} ");
334 generator.print("}\n");
337 if (singleLineMode) {
338 generator.print(" ");
340 generator.print("\n");
345 private void printFieldValue(final FieldDescriptor field,
347 final TextGenerator generator)
349 switch (field.getType()) {
353 generator.print(((Integer) value).toString());
359 generator.print(((Long) value).toString());
363 generator.print(((Boolean) value).toString());
367 generator.print(((Float) value).toString());
371 generator.print(((Double) value).toString());
376 generator.print(unsignedToString((Integer) value));
381 generator.print(unsignedToString((Long) value));
385 generator.print("\"");
386 generator.print(escapeNonAscii ?
387 escapeText((String) value) :
389 generator.print("\"");
393 generator.print("\"");
394 generator.print(escapeBytes((ByteString) value));
395 generator.print("\"");
399 generator.print(((EnumValueDescriptor) value).getName());
404 print((Message) value, generator);
409 private void printUnknownFields(final UnknownFieldSet unknownFields,
410 final TextGenerator generator)
412 for (Map.Entry<Integer, UnknownFieldSet.Field> entry :
413 unknownFields.asMap().entrySet()) {
414 final int number = entry.getKey();
415 final UnknownFieldSet.Field field = entry.getValue();
416 printUnknownField(number, WireFormat.WIRETYPE_VARINT,
417 field.getVarintList(), generator);
418 printUnknownField(number, WireFormat.WIRETYPE_FIXED32,
419 field.getFixed32List(), generator);
420 printUnknownField(number, WireFormat.WIRETYPE_FIXED64,
421 field.getFixed64List(), generator);
422 printUnknownField(number, WireFormat.WIRETYPE_LENGTH_DELIMITED,
423 field.getLengthDelimitedList(), generator);
424 for (final UnknownFieldSet value : field.getGroupList()) {
425 generator.print(entry.getKey().toString());
426 if (singleLineMode) {
427 generator.print(" { ");
429 generator.print(" {\n");
432 printUnknownFields(value, generator);
433 if (singleLineMode) {
434 generator.print("} ");
437 generator.print("}\n");
443 private void printUnknownField(final int number,
445 final List<?> values,
446 final TextGenerator generator)
448 for (final Object value : values) {
449 generator.print(String.valueOf(number));
450 generator.print(": ");
451 printUnknownFieldValue(wireType, value, generator);
452 generator.print(singleLineMode ? " " : "\n");
457 /** Convert an unsigned 32-bit integer to a string. */
458 private static String unsignedToString(final int value) {
460 return Integer.toString(value);
462 return Long.toString(((long) value) & 0x00000000FFFFFFFFL);
466 /** Convert an unsigned 64-bit integer to a string. */
467 private static String unsignedToString(final long value) {
469 return Long.toString(value);
471 // Pull off the most-significant bit so that BigInteger doesn't think
472 // the number is negative, then set it again using setBit().
473 return BigInteger.valueOf(value & 0x7FFFFFFFFFFFFFFFL)
474 .setBit(63).toString();
479 * An inner class for writing text to the output stream.
481 private static final class TextGenerator {
482 private final Appendable output;
483 private final StringBuilder indent = new StringBuilder();
484 private boolean atStartOfLine = true;
486 private TextGenerator(final Appendable output) {
487 this.output = output;
491 * Indent text by two spaces. After calling Indent(), two spaces will be
492 * inserted at the beginning of each line of text. Indent() may be called
493 * multiple times to produce deeper indents.
495 public void indent() {
500 * Reduces the current indent level by two spaces, or crashes if the indent
503 public void outdent() {
504 final int length = indent.length();
506 throw new IllegalArgumentException(
507 " Outdent() without matching Indent().");
509 indent.delete(length - 2, length);
513 * Print text to the output stream.
515 public void print(final CharSequence text) throws IOException {
516 final int size = text.length();
519 for (int i = 0; i < size; i++) {
520 if (text.charAt(i) == '\n') {
521 write(text.subSequence(pos, size), i - pos + 1);
523 atStartOfLine = true;
526 write(text.subSequence(pos, size), size - pos);
529 private void write(final CharSequence data, final int size)
535 atStartOfLine = false;
536 output.append(indent);
// =================================================================
// Parsing
546 * Represents a stream of tokens parsed from a {@code String}.
548 * <p>The Java standard library provides many classes that you might think
549 * would be useful for implementing this, but aren't. For example:
552 * <li>{@code java.io.StreamTokenizer}: This almost does what we want -- or,
553 * at least, something that would get us close to what we want -- except
554 * for one fatal flaw: It automatically un-escapes strings using Java
555 * escape sequences, which do not include all the escape sequences we
556 * need to support (e.g. '\x').
557 * <li>{@code java.util.Scanner}: This seems like a great way at least to
558 * parse regular expressions out of a stream (so we wouldn't have to load
559 * the entire input into a single string before parsing). Sadly,
560 * {@code Scanner} requires that tokens be delimited with some delimiter.
561 * Thus, although the text "foo:" should parse to two tokens ("foo" and
562 * ":"), {@code Scanner} would recognize it only as a single token.
563 * Furthermore, {@code Scanner} provides no way to inspect the contents
564 * of delimiters, making it impossible to keep track of line and column
568 * <p>Luckily, Java's regular expression support does manage to be useful to
569 * us. (Barely: We need {@code Matcher.usePattern()}, which is new in
570 * Java 1.5.) So, we can use that, at least. Unfortunately, this implies
571 * that we need to have the entire input in one contiguous string.
573 private static final class Tokenizer {
574 private final CharSequence text;
575 private final Matcher matcher;
576 private String currentToken;
578 // The character index within this.text at which the current token begins.
581 // The line and column numbers of the current token.
582 private int line = 0;
583 private int column = 0;
585 // The line and column numbers of the previous token (allows throwing
586 // errors *after* consuming).
587 private int previousLine = 0;
588 private int previousColumn = 0;
590 // We use possessive quantifiers (*+ and ++) because otherwise the Java
591 // regex matcher has stack overflows on large inputs.
592 private static final Pattern WHITESPACE =
593 Pattern.compile("(\\s|(#.*$))++", Pattern.MULTILINE);
594 private static final Pattern TOKEN = Pattern.compile(
595 "[a-zA-Z_][0-9a-zA-Z_+-]*+|" + // an identifier
596 "[.]?[0-9+-][0-9a-zA-Z_.+-]*+|" + // a number
597 "\"([^\"\n\\\\]|\\\\.)*+(\"|\\\\?$)|" + // a double-quoted string
598 "\'([^\'\n\\\\]|\\\\.)*+(\'|\\\\?$)", // a single-quoted string
601 private static final Pattern DOUBLE_INFINITY = Pattern.compile(
603 Pattern.CASE_INSENSITIVE);
604 private static final Pattern FLOAT_INFINITY = Pattern.compile(
606 Pattern.CASE_INSENSITIVE);
607 private static final Pattern FLOAT_NAN = Pattern.compile(
609 Pattern.CASE_INSENSITIVE);
611 /** Construct a tokenizer that parses tokens from the given text. */
612 private Tokenizer(final CharSequence text) {
614 this.matcher = WHITESPACE.matcher(text);
619 /** Are we at the end of the input? */
620 public boolean atEnd() {
621 return currentToken.length() == 0;
624 /** Advance to the next token. */
625 public void nextToken() {
627 previousColumn = column;
629 // Advance the line counter to the current position.
630 while (pos < matcher.regionStart()) {
631 if (text.charAt(pos) == '\n') {
640 // Match the next token.
641 if (matcher.regionStart() == matcher.regionEnd()) {
645 matcher.usePattern(TOKEN);
646 if (matcher.lookingAt()) {
647 currentToken = matcher.group();
648 matcher.region(matcher.end(), matcher.regionEnd());
650 // Take one character.
651 currentToken = String.valueOf(text.charAt(pos));
652 matcher.region(pos + 1, matcher.regionEnd());
660 * Skip over any whitespace so that the matcher region starts at the next
663 private void skipWhitespace() {
664 matcher.usePattern(WHITESPACE);
665 if (matcher.lookingAt()) {
666 matcher.region(matcher.end(), matcher.regionEnd());
671 * If the next token exactly matches {@code token}, consume it and return
672 * {@code true}. Otherwise, return {@code false} without doing anything.
674 public boolean tryConsume(final String token) {
675 if (currentToken.equals(token)) {
684 * If the next token exactly matches {@code token}, consume it. Otherwise,
685 * throw a {@link ParseException}.
687 public void consume(final String token) throws ParseException {
688 if (!tryConsume(token)) {
689 throw parseException("Expected \"" + token + "\".");
694 * Returns {@code true} if the next token is an integer, but does
697 public boolean lookingAtInteger() {
698 if (currentToken.length() == 0) {
702 final char c = currentToken.charAt(0);
703 return ('0' <= c && c <= '9') ||
704 c == '-' || c == '+';
708 * If the next token is an identifier, consume it and return its value.
709 * Otherwise, throw a {@link ParseException}.
711 public String consumeIdentifier() throws ParseException {
712 for (int i = 0; i < currentToken.length(); i++) {
713 final char c = currentToken.charAt(i);
714 if (('a' <= c && c <= 'z') ||
715 ('A' <= c && c <= 'Z') ||
716 ('0' <= c && c <= '9') ||
717 (c == '_') || (c == '.')) {
720 throw parseException("Expected identifier.");
724 final String result = currentToken;
730 * If the next token is a 32-bit signed integer, consume it and return its
731 * value. Otherwise, throw a {@link ParseException}.
733 public int consumeInt32() throws ParseException {
735 final int result = parseInt32(currentToken);
738 } catch (NumberFormatException e) {
739 throw integerParseException(e);
744 * If the next token is a 32-bit unsigned integer, consume it and return its
745 * value. Otherwise, throw a {@link ParseException}.
747 public int consumeUInt32() throws ParseException {
749 final int result = parseUInt32(currentToken);
752 } catch (NumberFormatException e) {
753 throw integerParseException(e);
758 * If the next token is a 64-bit signed integer, consume it and return its
759 * value. Otherwise, throw a {@link ParseException}.
761 public long consumeInt64() throws ParseException {
763 final long result = parseInt64(currentToken);
766 } catch (NumberFormatException e) {
767 throw integerParseException(e);
772 * If the next token is a 64-bit unsigned integer, consume it and return its
773 * value. Otherwise, throw a {@link ParseException}.
775 public long consumeUInt64() throws ParseException {
777 final long result = parseUInt64(currentToken);
780 } catch (NumberFormatException e) {
781 throw integerParseException(e);
786 * If the next token is a double, consume it and return its value.
787 * Otherwise, throw a {@link ParseException}.
789 public double consumeDouble() throws ParseException {
790 // We need to parse infinity and nan separately because
791 // Double.parseDouble() does not accept "inf", "infinity", or "nan".
792 if (DOUBLE_INFINITY.matcher(currentToken).matches()) {
793 final boolean negative = currentToken.startsWith("-");
795 return negative ? Double.NEGATIVE_INFINITY : Double.POSITIVE_INFINITY;
797 if (currentToken.equalsIgnoreCase("nan")) {
802 final double result = Double.parseDouble(currentToken);
805 } catch (NumberFormatException e) {
806 throw floatParseException(e);
811 * If the next token is a float, consume it and return its value.
812 * Otherwise, throw a {@link ParseException}.
814 public float consumeFloat() throws ParseException {
815 // We need to parse infinity and nan separately because
816 // Float.parseFloat() does not accept "inf", "infinity", or "nan".
817 if (FLOAT_INFINITY.matcher(currentToken).matches()) {
818 final boolean negative = currentToken.startsWith("-");
820 return negative ? Float.NEGATIVE_INFINITY : Float.POSITIVE_INFINITY;
822 if (FLOAT_NAN.matcher(currentToken).matches()) {
827 final float result = Float.parseFloat(currentToken);
830 } catch (NumberFormatException e) {
831 throw floatParseException(e);
836 * If the next token is a boolean, consume it and return its value.
837 * Otherwise, throw a {@link ParseException}.
839 public boolean consumeBoolean() throws ParseException {
840 if (currentToken.equals("true") ||
841 currentToken.equals("t") ||
842 currentToken.equals("1")) {
845 } else if (currentToken.equals("false") ||
846 currentToken.equals("f") ||
847 currentToken.equals("0")) {
851 throw parseException("Expected \"true\" or \"false\".");
856 * If the next token is a string, consume it and return its (unescaped)
857 * value. Otherwise, throw a {@link ParseException}.
859 public String consumeString() throws ParseException {
860 return consumeByteString().toStringUtf8();
864 * If the next token is a string, consume it, unescape it as a
865 * {@link ByteString}, and return it. Otherwise, throw a
866 * {@link ParseException}.
868 public ByteString consumeByteString() throws ParseException {
869 List<ByteString> list = new ArrayList<ByteString>();
870 consumeByteString(list);
871 while (currentToken.startsWith("'") || currentToken.startsWith("\"")) {
872 consumeByteString(list);
874 return ByteString.copyFrom(list);
878 * Like {@link #consumeByteString()} but adds each token of the string to
879 * the given list. String literals (whether bytes or text) may come in
880 * multiple adjacent tokens which are automatically concatenated, like in
883 private void consumeByteString(List<ByteString> list) throws ParseException {
884 final char quote = currentToken.length() > 0 ? currentToken.charAt(0)
886 if (quote != '\"' && quote != '\'') {
887 throw parseException("Expected string.");
890 if (currentToken.length() < 2 ||
891 currentToken.charAt(currentToken.length() - 1) != quote) {
892 throw parseException("String missing ending quote.");
896 final String escaped =
897 currentToken.substring(1, currentToken.length() - 1);
898 final ByteString result = unescapeBytes(escaped);
901 } catch (InvalidEscapeSequenceException e) {
902 throw parseException(e.getMessage());
907 * Returns a {@link ParseException} with the current line and column
908 * numbers in the description, suitable for throwing.
910 public ParseException parseException(final String description) {
911 // Note: People generally prefer one-based line and column numbers.
912 return new ParseException(
913 line + 1, column + 1, description);
917 * Returns a {@link ParseException} with the line and column numbers of
918 * the previous token in the description, suitable for throwing.
920 public ParseException parseExceptionPreviousToken(
921 final String description) {
922 // Note: People generally prefer one-based line and column numbers.
923 return new ParseException(
924 previousLine + 1, previousColumn + 1, description);
928 * Constructs an appropriate {@link ParseException} for the given
929 * {@code NumberFormatException} when trying to parse an integer.
931 private ParseException integerParseException(
932 final NumberFormatException e) {
933 return parseException("Couldn't parse integer: " + e.getMessage());
937 * Constructs an appropriate {@link ParseException} for the given
938 * {@code NumberFormatException} when trying to parse a float or double.
940 private ParseException floatParseException(final NumberFormatException e) {
941 return parseException("Couldn't parse number: " + e.getMessage());
945 /** Thrown when parsing an invalid text format message. */
946 public static class ParseException extends IOException {
947 private static final long serialVersionUID = 3196188060225107702L;
949 private final int line;
950 private final int column;
952 /** Create a new instance, with -1 as the line and column numbers. */
953 public ParseException(final String message) {
954 this(-1, -1, message);
958 * Create a new instance
960 * @param line the line number where the parse error occurred,
962 * @param column the column number where the parser error occurred,
965 public ParseException(final int line, final int column,
966 final String message) {
967 super(Integer.toString(line) + ":" + column + ": " + message);
969 this.column = column;
973 * Return the line where the parse exception occurred, or -1 when
974 * none is provided. The value is specified as 1-offset, so the first
977 public int getLine() {
982 * Return the column where the parse exception occurred, or -1 when
983 * none is provided. The value is specified as 1-offset, so the first
986 public int getColumn() {
992 * Parse a text-format message from {@code input} and merge the contents
993 * into {@code builder}.
995 public static void merge(final Readable input,
996 final Message.Builder builder)
998 merge(input, ExtensionRegistry.getEmptyRegistry(), builder);
1002 * Parse a text-format message from {@code input} and merge the contents
1003 * into {@code builder}.
1005 public static void merge(final CharSequence input,
1006 final Message.Builder builder)
1007 throws ParseException {
1008 merge(input, ExtensionRegistry.getEmptyRegistry(), builder);
1012 * Parse a text-format message from {@code input} and merge the contents
1013 * into {@code builder}. Extensions will be recognized if they are
1014 * registered in {@code extensionRegistry}.
1016 public static void merge(final Readable input,
1017 final ExtensionRegistry extensionRegistry,
1018 final Message.Builder builder)
1019 throws IOException {
1020 // Read the entire input to a String then parse that.
1022 // If StreamTokenizer were not quite so crippled, or if there were a kind
1023 // of Reader that could read in chunks that match some particular regex,
1024 // or if we wanted to write a custom Reader to tokenize our stream, then
1025 // we would not have to read to one big String. Alas, none of these is
1026 // the case. Oh well.
1028 merge(toStringBuilder(input), extensionRegistry, builder);
1031 private static final int BUFFER_SIZE = 4096;
1033 // TODO(chrisn): See if working around java.io.Reader#read(CharBuffer)
1034 // overhead is worthwhile
1035 private static StringBuilder toStringBuilder(final Readable input)
1036 throws IOException {
1037 final StringBuilder text = new StringBuilder();
1038 final CharBuffer buffer = CharBuffer.allocate(BUFFER_SIZE);
1040 final int n = input.read(buffer);
1045 text.append(buffer, 0, n);
1051 * Parse a text-format message from {@code input} and merge the contents
1052 * into {@code builder}. Extensions will be recognized if they are
1053 * registered in {@code extensionRegistry}.
1055 public static void merge(final CharSequence input,
1056 final ExtensionRegistry extensionRegistry,
1057 final Message.Builder builder)
1058 throws ParseException {
1059 final Tokenizer tokenizer = new Tokenizer(input);
1061 while (!tokenizer.atEnd()) {
1062 mergeField(tokenizer, extensionRegistry, builder);
1067 * Parse a single field from {@code tokenizer} and merge it into
1070 private static void mergeField(final Tokenizer tokenizer,
1071 final ExtensionRegistry extensionRegistry,
1072 final Message.Builder builder)
1073 throws ParseException {
1074 FieldDescriptor field;
1075 final Descriptor type = builder.getDescriptorForType();
1076 ExtensionRegistry.ExtensionInfo extension = null;
1078 if (tokenizer.tryConsume("[")) {
1080 final StringBuilder name =
1081 new StringBuilder(tokenizer.consumeIdentifier());
1082 while (tokenizer.tryConsume(".")) {
1084 name.append(tokenizer.consumeIdentifier());
1087 extension = extensionRegistry.findExtensionByName(name.toString());
1089 if (extension == null) {
1090 throw tokenizer.parseExceptionPreviousToken(
1091 "Extension \"" + name + "\" not found in the ExtensionRegistry.");
1092 } else if (extension.descriptor.getContainingType() != type) {
1093 throw tokenizer.parseExceptionPreviousToken(
1094 "Extension \"" + name + "\" does not extend message type \"" +
1095 type.getFullName() + "\".");
1098 tokenizer.consume("]");
1100 field = extension.descriptor;
1102 final String name = tokenizer.consumeIdentifier();
1103 field = type.findFieldByName(name);
1105 // Group names are expected to be capitalized as they appear in the
1106 // .proto file, which actually matches their type names, not their field
1108 if (field == null) {
1109 // Explicitly specify US locale so that this code does not break when
1110 // executing in Turkey.
1111 final String lowerName = name.toLowerCase(Locale.US);
1112 field = type.findFieldByName(lowerName);
1113 // If the case-insensitive match worked but the field is NOT a group,
1114 if (field != null && field.getType() != FieldDescriptor.Type.GROUP) {
1118 // Again, special-case group names as described above.
1119 if (field != null && field.getType() == FieldDescriptor.Type.GROUP &&
1120 !field.getMessageType().getName().equals(name)) {
1124 if (field == null) {
1125 throw tokenizer.parseExceptionPreviousToken(
1126 "Message type \"" + type.getFullName() +
1127 "\" has no field named \"" + name + "\".");
1131 Object value = null;
1133 if (field.getJavaType() == FieldDescriptor.JavaType.MESSAGE) {
1134 tokenizer.tryConsume(":"); // optional
1136 final String endToken;
1137 if (tokenizer.tryConsume("<")) {
1140 tokenizer.consume("{");
1144 final Message.Builder subBuilder;
1145 if (extension == null) {
1146 subBuilder = builder.newBuilderForField(field);
1148 subBuilder = extension.defaultInstance.newBuilderForType();
1151 while (!tokenizer.tryConsume(endToken)) {
1152 if (tokenizer.atEnd()) {
1153 throw tokenizer.parseException(
1154 "Expected \"" + endToken + "\".");
1156 mergeField(tokenizer, extensionRegistry, subBuilder);
1159 value = subBuilder.buildPartial();
1162 tokenizer.consume(":");
1164 switch (field.getType()) {
1168 value = tokenizer.consumeInt32();
1174 value = tokenizer.consumeInt64();
1179 value = tokenizer.consumeUInt32();
1184 value = tokenizer.consumeUInt64();
1188 value = tokenizer.consumeFloat();
1192 value = tokenizer.consumeDouble();
1196 value = tokenizer.consumeBoolean();
1200 value = tokenizer.consumeString();
1204 value = tokenizer.consumeByteString();
1208 final EnumDescriptor enumType = field.getEnumType();
1210 if (tokenizer.lookingAtInteger()) {
1211 final int number = tokenizer.consumeInt32();
1212 value = enumType.findValueByNumber(number);
1213 if (value == null) {
1214 throw tokenizer.parseExceptionPreviousToken(
1215 "Enum type \"" + enumType.getFullName() +
1216 "\" has no value with number " + number + '.');
1219 final String id = tokenizer.consumeIdentifier();
1220 value = enumType.findValueByName(id);
1221 if (value == null) {
1222 throw tokenizer.parseExceptionPreviousToken(
1223 "Enum type \"" + enumType.getFullName() +
1224 "\" has no value named \"" + id + "\".");
1232 throw new RuntimeException("Can't get here.");
1236 if (field.isRepeated()) {
1237 builder.addRepeatedField(field, value);
1239 builder.setField(field, value);
1243 // =================================================================
1244 // Utility functions
// Some of these methods are package-private because Descriptors.java uses
// them.
1250 * Escapes bytes in the format used in protocol buffer text format, which
1251 * is the same as the format used for C string literals. All bytes
1252 * that are not printable 7-bit ASCII characters are escaped, as well as
1253 * backslash, single-quote, and double-quote characters. Characters for
1254 * which no defined short-hand escape sequence is defined will be escaped
1255 * using 3-digit octal sequences.
1257 static String escapeBytes(final ByteString input) {
1258 final StringBuilder builder = new StringBuilder(input.size());
1259 for (int i = 0; i < input.size(); i++) {
1260 final byte b = input.byteAt(i);
1262 // Java does not recognize \a or \v, apparently.
1263 case 0x07: builder.append("\\a" ); break;
1264 case '\b': builder.append("\\b" ); break;
1265 case '\f': builder.append("\\f" ); break;
1266 case '\n': builder.append("\\n" ); break;
1267 case '\r': builder.append("\\r" ); break;
1268 case '\t': builder.append("\\t" ); break;
1269 case 0x0b: builder.append("\\v" ); break;
1270 case '\\': builder.append("\\\\"); break;
1271 case '\'': builder.append("\\\'"); break;
1272 case '"' : builder.append("\\\""); break;
1274 // Note: Bytes with the high-order bit set should be escaped. Since
1275 // bytes are signed, such bytes will compare less than 0x20, hence
1276 // the following line is correct.
1278 builder.append((char) b);
1280 builder.append('\\');
1281 builder.append((char) ('0' + ((b >>> 6) & 3)));
1282 builder.append((char) ('0' + ((b >>> 3) & 7)));
1283 builder.append((char) ('0' + (b & 7)));
1288 return builder.toString();
1292 * Un-escape a byte sequence as escaped using
1293 * {@link #escapeBytes(ByteString)}. Two-digit hex escapes (starting with
1294 * "\x") are also recognized.
1296 static ByteString unescapeBytes(final CharSequence charString)
1297 throws InvalidEscapeSequenceException {
1298 // First convert the Java character sequence to UTF-8 bytes.
1299 ByteString input = ByteString.copyFromUtf8(charString.toString());
1300 // Then unescape certain byte sequences introduced by ASCII '\\'. The valid
1301 // escapes can all be expressed with ASCII characters, so it is safe to
1302 // operate on bytes here.
1304 // Unescaping the input byte array will result in a byte sequence that's no
1305 // longer than the input. That's because each escape sequence is between
1306 // two and four bytes long and stands for a single byte.
1307 final byte[] result = new byte[input.size()];
1309 for (int i = 0; i < input.size(); i++) {
1310 byte c = input.byteAt(i);
1312 if (i + 1 < input.size()) {
1314 c = input.byteAt(i);
1317 int code = digitValue(c);
1318 if (i + 1 < input.size() && isOctal(input.byteAt(i + 1))) {
1320 code = code * 8 + digitValue(input.byteAt(i));
1322 if (i + 1 < input.size() && isOctal(input.byteAt(i + 1))) {
1324 code = code * 8 + digitValue(input.byteAt(i));
1326 // TODO: Check that 0 <= code && code <= 0xFF.
1327 result[pos++] = (byte)code;
1330 case 'a' : result[pos++] = 0x07; break;
1331 case 'b' : result[pos++] = '\b'; break;
1332 case 'f' : result[pos++] = '\f'; break;
1333 case 'n' : result[pos++] = '\n'; break;
1334 case 'r' : result[pos++] = '\r'; break;
1335 case 't' : result[pos++] = '\t'; break;
1336 case 'v' : result[pos++] = 0x0b; break;
1337 case '\\': result[pos++] = '\\'; break;
1338 case '\'': result[pos++] = '\''; break;
1339 case '"' : result[pos++] = '\"'; break;
1344 if (i + 1 < input.size() && isHex(input.byteAt(i + 1))) {
1346 code = digitValue(input.byteAt(i));
1348 throw new InvalidEscapeSequenceException(
1349 "Invalid escape sequence: '\\x' with no digits");
1351 if (i + 1 < input.size() && isHex(input.byteAt(i + 1))) {
1353 code = code * 16 + digitValue(input.byteAt(i));
1355 result[pos++] = (byte)code;
1359 throw new InvalidEscapeSequenceException(
1360 "Invalid escape sequence: '\\" + (char)c + '\'');
1364 throw new InvalidEscapeSequenceException(
1365 "Invalid escape sequence: '\\' at end of string.");
1372 return ByteString.copyFrom(result, 0, pos);
1376 * Thrown by {@link TextFormat#unescapeBytes} and
1377 * {@link TextFormat#unescapeText} when an invalid escape sequence is seen.
1379 static class InvalidEscapeSequenceException extends IOException {
1380 private static final long serialVersionUID = -8164033650142593304L;
1382 InvalidEscapeSequenceException(final String description) {
1388 * Like {@link #escapeBytes(ByteString)}, but escapes a text string.
1389 * Non-ASCII characters are first encoded as UTF-8, then each byte is escaped
1390 * individually as a 3-digit octal escape. Yes, it's weird.
1392 static String escapeText(final String input) {
1393 return escapeBytes(ByteString.copyFromUtf8(input));
1397 * Un-escape a text string as escaped using {@link #escapeText(String)}.
1398 * Two-digit hex escapes (starting with "\x") are also recognized.
1400 static String unescapeText(final String input)
1401 throws InvalidEscapeSequenceException {
1402 return unescapeBytes(input).toStringUtf8();
1405 /** Is this an octal digit? */
1406 private static boolean isOctal(final byte c) {
1407 return '0' <= c && c <= '7';
1410 /** Is this a hex digit? */
1411 private static boolean isHex(final byte c) {
1412 return ('0' <= c && c <= '9') ||
1413 ('a' <= c && c <= 'f') ||
1414 ('A' <= c && c <= 'F');
1418 * Interpret a character as a digit (in any base up to 36) and return the
1419 * numeric value. This is like {@code Character.digit()} but we don't accept
1422 private static int digitValue(final byte c) {
1423 if ('0' <= c && c <= '9') {
1425 } else if ('a' <= c && c <= 'z') {
1426 return c - 'a' + 10;
1428 return c - 'A' + 10;
1433 * Parse a 32-bit signed integer from the text. Unlike the Java standard
1434 * {@code Integer.parseInt()}, this function recognizes the prefixes "0x"
1435 * and "0" to signify hexadecimal and octal numbers, respectively.
1437 static int parseInt32(final String text) throws NumberFormatException {
1438 return (int) parseInteger(text, true, false);
1442 * Parse a 32-bit unsigned integer from the text. Unlike the Java standard
1443 * {@code Integer.parseInt()}, this function recognizes the prefixes "0x"
1444 * and "0" to signify hexadecimal and octal numbers, respectively. The
1445 * result is coerced to a (signed) {@code int} when returned since Java has
1446 * no unsigned integer type.
1448 static int parseUInt32(final String text) throws NumberFormatException {
1449 return (int) parseInteger(text, false, false);
1453 * Parse a 64-bit signed integer from the text. Unlike the Java standard
1454 * {@code Integer.parseInt()}, this function recognizes the prefixes "0x"
1455 * and "0" to signify hexadecimal and octal numbers, respectively.
1457 static long parseInt64(final String text) throws NumberFormatException {
1458 return parseInteger(text, true, true);
1462 * Parse a 64-bit unsigned integer from the text. Unlike the Java standard
1463 * {@code Integer.parseInt()}, this function recognizes the prefixes "0x"
1464 * and "0" to signify hexadecimal and octal numbers, respectively. The
1465 * result is coerced to a (signed) {@code long} when returned since Java has
1466 * no unsigned long type.
1468 static long parseUInt64(final String text) throws NumberFormatException {
1469 return parseInteger(text, false, true);
1472 private static long parseInteger(final String text,
1473 final boolean isSigned,
1474 final boolean isLong)
1475 throws NumberFormatException {
1478 boolean negative = false;
1479 if (text.startsWith("-", pos)) {
1481 throw new NumberFormatException("Number must be positive: " + text);
1488 if (text.startsWith("0x", pos)) {
1491 } else if (text.startsWith("0", pos)) {
1495 final String numberText = text.substring(pos);
1498 if (numberText.length() < 16) {
1499 // Can safely assume no overflow.
1500 result = Long.parseLong(numberText, radix);
1506 // No need to check for 64-bit numbers since they'd have to be 16 chars
1507 // or longer to overflow.
1510 if (result > Integer.MAX_VALUE || result < Integer.MIN_VALUE) {
1511 throw new NumberFormatException(
1512 "Number out of range for 32-bit signed integer: " + text);
1515 if (result >= (1L << 32) || result < 0) {
1516 throw new NumberFormatException(
1517 "Number out of range for 32-bit unsigned integer: " + text);
1522 BigInteger bigValue = new BigInteger(numberText, radix);
1524 bigValue = bigValue.negate();
1530 if (bigValue.bitLength() > 31) {
1531 throw new NumberFormatException(
1532 "Number out of range for 32-bit signed integer: " + text);
1535 if (bigValue.bitLength() > 32) {
1536 throw new NumberFormatException(
1537 "Number out of range for 32-bit unsigned integer: " + text);
1542 if (bigValue.bitLength() > 63) {
1543 throw new NumberFormatException(
1544 "Number out of range for 64-bit signed integer: " + text);
1547 if (bigValue.bitLength() > 64) {
1548 throw new NumberFormatException(
1549 "Number out of range for 64-bit unsigned integer: " + text);
1554 result = bigValue.longValue();