src/third_party/protobuf/java/src/main/java/com/google/protobuf/Internal.java

   1 // Protocol Buffers - Google's data interchange format
   2 // Copyright 2008 Google Inc.  All rights reserved.
   3 // http://code.google.com/p/protobuf/
   4 //
   5 // Redistribution and use in source and binary forms, with or without
   6 // modification, are permitted provided that the following conditions are
   7 // met:
   8 //
   9 //     * Redistributions of source code must retain the above copyright
  10 // notice, this list of conditions and the following disclaimer.
  11 //     * Redistributions in binary form must reproduce the above
  12 // copyright notice, this list of conditions and the following disclaimer
  13 // in the documentation and/or other materials provided with the
  14 // distribution.
  15 //     * Neither the name of Google Inc. nor the names of its
  16 // contributors may be used to endorse or promote products derived from
  17 // this software without specific prior written permission.
  18 //
  19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  30
  31 package com.google.protobuf;
  32
  33 import java.io.UnsupportedEncodingException;
  34
  35 /**
  36  * The classes contained within are used internally by the Protocol Buffer
  37  * library and generated message implementations. They are public only because
  38  * those generated messages do not reside in the {@code protobuf} package.
  39  * Others should not use this class directly.
  40  *
  41  * @author kenton@google.com (Kenton Varda)
  42  */
  43 public class Internal {
  44   /**
  45    * Helper called by generated code to construct default values for string
  46    * fields.
  47    * <p>
  48    * The protocol compiler does not actually contain a UTF-8 decoder -- it
  49    * just pushes UTF-8-encoded text around without touching it.  The one place
  50    * where this presents a problem is when generating Java string literals.
  51    * Unicode characters in the string literal would normally need to be encoded
  52    * using a Unicode escape sequence, which would require decoding them.
  53    * To get around this, protoc instead embeds the UTF-8 bytes into the
  54    * generated code and leaves it to the runtime library to decode them.
  55    * <p>
  56    * It gets worse, though.  If protoc just generated a byte array, like:
  57    *   new byte[] {0x12, 0x34, 0x56, 0x78}
  58    * Java actually generates *code* which allocates an array and then fills
  59    * in each value.  This is much less efficient than just embedding the bytes
  60    * directly into the bytecode.  To get around this, we need another
  61    * work-around.  String literals are embedded directly, so protoc actually
  62    * generates a string literal corresponding to the bytes.  The easiest way
  63    * to do this is to use the ISO-8859-1 character set, which corresponds to
  64    * the first 256 characters of the Unicode range.  Protoc can then use
  65    * good old CEscape to generate the string.
  66    * <p>
  67    * So we have a string literal which represents a set of bytes which
  68    * represents another string.  This function -- stringDefaultValue --
  69    * converts from the generated string to the string we actually want.  The
  70    * generated code calls this automatically.
  71    */
  72   public static String stringDefaultValue(String bytes) {
  73     try {
  74       return new String(bytes.getBytes("ISO-8859-1"), "UTF-8");
  75     } catch (UnsupportedEncodingException e) {
  76       // This should never happen since all JVMs are required to implement
  77       // both of the above character sets.
  78       throw new IllegalStateException(
  79           "Java VM does not support a standard character set.", e);
  80     }
  81   }
  82
  83   /**
  84    * Helper called by generated code to construct default values for bytes
  85    * fields.
  86    * <p>
  87    * This is a lot like {@link #stringDefaultValue}, but for bytes fields.
  88    * In this case we only need the second of the two hacks -- allowing us to
  89    * embed raw bytes as a string literal with ISO-8859-1 encoding.
  90    */
  91   public static ByteString bytesDefaultValue(String bytes) {
  92     try {
  93       return ByteString.copyFrom(bytes.getBytes("ISO-8859-1"));
  94     } catch (UnsupportedEncodingException e) {
  95       // This should never happen since all JVMs are required to implement
  96       // ISO-8859-1.
  97       throw new IllegalStateException(
  98           "Java VM does not support a standard character set.", e);
  99     }
 100   }
 101
 102   /**
 103    * Helper called by generated code to determine if a byte array is a valid
 104    * UTF-8 encoded string such that the original bytes can be converted to
 105    * a String object and then back to a byte array round tripping the bytes
 106    * without loss.  More precisely, returns {@code true} whenever:
 107    * <pre>   {@code
 108    * Arrays.equals(byteString.toByteArray(),
 109    *     new String(byteString.toByteArray(), "UTF-8").getBytes("UTF-8"))
 110    * }</pre>
 111    *
 112    * <p>This method rejects "overlong" byte sequences, as well as
 113    * 3-byte sequences that would map to a surrogate character, in
 114    * accordance with the restricted definition of UTF-8 introduced in
 115    * Unicode 3.1.  Note that the UTF-8 decoder included in Oracle's
 116    * JDK has been modified to also reject "overlong" byte sequences,
 117    * but currently (2011) still accepts 3-byte surrogate character
 118    * byte sequences.
 119    *
 120    * <p>See the Unicode Standard,</br>
 121    * Table 3-6. <em>UTF-8 Bit Distribution</em>,</br>
 122    * Table 3-7. <em>Well Formed UTF-8 Byte Sequences</em>.
 123    *
 124    * <p>As of 2011-02, this method simply returns the result of {@link
 125    * ByteString#isValidUtf8()}.  Calling that method directly is preferred.
 126    *
 127    * @param byteString the string to check
 128    * @return whether the byte array is round trippable
 129    */
 130   public static boolean isValidUtf8(ByteString byteString) {
 131     return byteString.isValidUtf8();
 132   }
 133
 134   /**
 135    * Interface for an enum value or value descriptor, to be used in FieldSet.
 136    * The lite library stores enum values directly in FieldSets but the full
 137    * library stores EnumValueDescriptors in order to better support reflection.
 138    */
 139   public interface EnumLite {
 140     int getNumber();
 141   }
 142
 143   /**
 144    * Interface for an object which maps integers to {@link EnumLite}s.
 145    * {@link Descriptors.EnumDescriptor} implements this interface by mapping
 146    * numbers to {@link Descriptors.EnumValueDescriptor}s.  Additionally,
 147    * every generated enum type has a static method internalGetValueMap() which
 148    * returns an implementation of this type that maps numbers to enum values.
 149    */
 150   public interface EnumLiteMap<T extends EnumLite> {
 151     T findValueByNumber(int number);
 152   }
 153 }