// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc. All rights reserved.
// http://code.google.com/p/protobuf/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 package com.google.protobuf;
33 import static junit.framework.Assert.*;
35 import java.io.UnsupportedEncodingException;
36 import java.util.ArrayList;
37 import java.util.Arrays;
38 import java.util.List;
39 import java.util.Random;
40 import java.util.logging.Logger;
41 import java.nio.charset.CharsetDecoder;
42 import java.nio.charset.Charset;
43 import java.nio.charset.CodingErrorAction;
44 import java.nio.charset.CharsetEncoder;
45 import java.nio.charset.CoderResult;
46 import java.nio.ByteBuffer;
47 import java.nio.CharBuffer;
/**
 * Shared testing code for {@link IsValidUtf8Test} and
 * {@link IsValidUtf8FourByteTest}.
 *
 * @author jonp@google.com (Jon Perlow)
 * @author martinrb@google.com (Martin Buchholz)
 */
56 class IsValidUtf8TestUtil {
57 private static Logger logger = Logger.getLogger(
58 IsValidUtf8TestUtil.class.getName());
60 // 128 - [chars 0x0000 to 0x007f]
61 static long ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x007f - 0x0000 + 1;
64 static long EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT =
65 ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS;
67 // 1920 [chars 0x0080 to 0x07FF]
68 static long TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x07FF - 0x0080 + 1;
71 static long EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT =
72 // Both bytes are one byte characters
73 (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 2) +
74 // The possible number of two byte characters
75 TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS;
78 static long THREE_BYTE_SURROGATES = 2 * 1024;
80 // 61,440 [chars 0x0800 to 0xFFFF, minus surrogates]
81 static long THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS =
82 0xFFFF - 0x0800 + 1 - THREE_BYTE_SURROGATES;
85 static long EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT =
86 // All one byte characters
87 (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 3) +
88 // One two byte character and a one byte character
89 2 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS *
90 ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS +
91 // Three byte characters
92 THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS;
94 // 1,048,576 [chars 0x10000L to 0x10FFFF]
95 static long FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x10FFFF - 0x10000L + 1;
98 static long EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT =
99 // All one byte characters
100 (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 4) +
101 // One and three byte characters
102 2 * THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS *
103 ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS +
104 // Two two byte characters
105 TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS +
106 // Permutations of one and two byte characters
107 3 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS *
108 ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS *
109 ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS +
110 // Four byte characters
111 FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS;
// Shard constructor: checks start < lim with the statically imported JUnit
// assertTrue, then records the expected count on the instance.
// NOTE(review): the enclosing class declaration, its field declarations and
// any assignments of index/start/lim are not present in this extract (the
// embedded listing numbers jump from 121 to 125), and the closing braces are
// missing as well - confirm against the complete file before relying on this.
120 public Shard(long index, long start, long lim, long expected) {
121 assertTrue(start < lim);
125 this.expected = expected;
129 static final long[] FOUR_BYTE_SHARDS_EXPECTED_ROUNTRIPPABLES =
130 generateFourByteShardsExpectedRunnables();
132 private static long[] generateFourByteShardsExpectedRunnables() {
133 long[] expected = new long[128];
135 // 0-63 are all 5300224
136 for (int i = 0; i <= 63; i++) {
137 expected[i] = 5300224;
140 // 97-111 are all 2342912
141 for (int i = 97; i <= 111; i++) {
142 expected[i] = 2342912;
145 // 113-117 are all 1048576
146 for (int i = 113; i <= 117; i++) {
147 expected[i] = 1048576;
151 expected[112] = 786432;
152 expected[118] = 786432;
153 expected[119] = 1048576;
154 expected[120] = 458752;
155 expected[121] = 524288;
156 expected[122] = 65536;
158 // Anything not assigned was the default 0.
// Shard list for the four-byte test, built once at class initialization from
// a shard count of 128 and FOUR_BYTE_SHARDS_EXPECTED_ROUNTRIPPABLES.
162 static final List<Shard> FOUR_BYTE_SHARDS = generateFourByteShards(
163 128, FOUR_BYTE_SHARDS_EXPECTED_ROUNTRIPPABLES);
// Builds numShards Shard objects, one per loop index i.
// Preconditions checked with JUnit assertions: expected must have exactly
// numShards entries, and LIM must be an exact multiple of numShards.
// increment is LIM / numShards - presumably the width of each shard's range;
// verify against the Shard constructor arguments.
// NOTE(review): the declaration of LIM, every Shard constructor argument after
// `i`, the return statement and the closing braces are not present in this
// extract (embedded listing numbers 170 and 175+ are missing) - confirm
// against the complete file.
166 private static List<Shard> generateFourByteShards(
167 int numShards, long[] expected) {
168 assertEquals(numShards, expected.length);
169 List<Shard> shards = new ArrayList<Shard>(numShards);
171 long increment = LIM / numShards;
172 assertTrue(LIM % numShards == 0);
173 for (int i = 0; i < numShards; i++) {
174 shards.add(new Shard(i,
183 * Helper to run the loop to test all the permutations for the number of bytes
186 * @param numBytes the number of bytes in the byte array
187 * @param expectedCount the expected number of roundtrippable permutations
189 static void testBytes(int numBytes, long expectedCount)
190 throws UnsupportedEncodingException {
191 testBytes(numBytes, expectedCount, 0, -1);
/**
 * Helper to run the loop to test all the permutations for the number of bytes
 * specified. This overload is useful for debugging to get the loop to start
 * at a certain character.
 *
 * @param numBytes the number of bytes in the byte array
 * @param expectedCount the expected number of roundtrippable permutations
 * @param start the starting bytes encoded as a long as big-endian
 * @param lim the limit of bytes to process encoded as a long as big-endian,
 *     or -1 to mean the max limit for numBytes
 */
// Walks byteChar from start up to (not including) lim, unpacks each value into
// a numBytes-long byte array, and checks that ByteString.isValidUtf8() agrees
// with a String decode/re-encode round trip, with the static Utf8 methods,
// with incremental (partial) validation, and with rope-based ByteStrings.
// Finally asserts that the number of roundtrippable values equals
// expectedCount.
// NOTE(review): this extract has lost many short lines (closing braces, guard
// conditions, counter updates); the embedded listing numbers show the gaps.
// The notes below mark places where the visible code depends on a missing
// line - confirm each against the complete file.
205 static void testBytes(int numBytes, long expectedCount, long start, long lim)
206 throws UnsupportedEncodingException {
// Unseeded generator: the partition points chosen below differ between runs.
207 Random rnd = new Random();
208 byte[] bytes = new byte[numBytes];
// 2^(8 * numBytes): one past the largest value representable in numBytes bytes.
// NOTE(review): the parameter documentation says lim == -1 selects this
// default; the guard that would make the assignment conditional is not visible
// (listing jumps 208 -> 211).
211 lim = 1L << (numBytes * 8);
214 long countRoundTripped = 0;
215 for (long byteChar = start; byteChar < lim; byteChar++) {
// Unpack byteChar big-endian: the lowest 8 bits go into the last array slot,
// then shift right by one byte for the next slot to the left.
216 long tmpByteChar = byteChar;
217 for (int i = 0; i < numBytes; i++) {
218 bytes[bytes.length - i - 1] = (byte) tmpByteChar;
219 tmpByteChar = tmpByteChar >> 8;
221 ByteString bs = ByteString.copyFrom(bytes);
222 boolean isRoundTrippable = bs.isValidUtf8();
// Reference check: decode the bytes as UTF-8 via String and encode them back;
// bytesEqual is true when the re-encoded bytes match the input exactly.
223 String s = new String(bytes, "UTF-8");
224 byte[] bytesReencoded = s.getBytes("UTF-8");
225 boolean bytesEqual = Arrays.equals(bytes, bytesReencoded);
// Any disagreement between the reference check and isValidUtf8() is reported
// through outputFailure.
227 if (bytesEqual != isRoundTrippable) {
228 outputFailure(byteChar, bytes, bytesReencoded);
231 // Check agreement with static Utf8 methods.
232 assertEquals(isRoundTrippable, Utf8.isValidUtf8(bytes));
233 assertEquals(isRoundTrippable, Utf8.isValidUtf8(bytes, 0, numBytes));
235 // Test partial sequences.
236 // Partition numBytes into three segments (not necessarily non-empty).
237 int i = rnd.nextInt(numBytes);
238 int j = rnd.nextInt(numBytes);
// Swap i and j.
// NOTE(review): presumably executed only when the two are out of order so
// that i <= j afterwards; the guarding condition is not visible (listing line
// 239 is missing).
240 int tmp = i; i = j; j = tmp;
// Validate the three segments [0, i), [i, j), [j, numBytes) one after another,
// feeding the state returned by each call into the next call.
242 int state1 = Utf8.partialIsValidUtf8(Utf8.COMPLETE, bytes, 0, i);
243 int state2 = Utf8.partialIsValidUtf8(state1, bytes, i, j);
244 int state3 = Utf8.partialIsValidUtf8(state2, bytes, j, numBytes);
// Print the three intermediate states and the split points before failing, so
// a mismatch can be reproduced despite the random partition.
245 if (isRoundTrippable != (state3 == Utf8.COMPLETE)) {
246 System.out.printf("state=%04x %04x %04x i=%d j=%d%n",
247 state1, state2, state3, i, j);
248 outputFailure(byteChar, bytes, bytesReencoded);
250 assertEquals(isRoundTrippable, (state3 == Utf8.COMPLETE));
252 // Test ropes built out of small partial sequences
// NOTE(review): the first arguments of both newInstanceForTest calls are not
// visible (listing lines 254 and 256 are missing); only the final piece,
// bs.substring(j, numBytes), can be seen here.
253 ByteString rope = RopeByteString.newInstanceForTest(
255 RopeByteString.newInstanceForTest(
257 bs.substring(j, numBytes)));
258 assertSame(RopeByteString.class, rope.getClass());
// Run the same checks against three ByteStrings: the original, a full-length
// substring of it, and the rope.
// NOTE(review): several assertEquals openings and first arguments in this loop
// are missing from the extract; the visible lines are the second halves of
// those calls.
260 ByteString[] byteStrings = { bs, bs.substring(0, numBytes), rope };
261 for (ByteString x : byteStrings) {
262 assertEquals(isRoundTrippable,
265 x.partialIsValidUtf8(Utf8.COMPLETE, 0, numBytes));
268 x.partialIsValidUtf8(Utf8.COMPLETE, 0, i));
270 x.substring(0, i).partialIsValidUtf8(Utf8.COMPLETE, 0, i));
272 x.partialIsValidUtf8(state1, i, j - i));
274 x.substring(i, j).partialIsValidUtf8(state1, 0, j - i));
276 x.partialIsValidUtf8(state2, j, numBytes - j));
278 x.substring(j, numBytes)
279 .partialIsValidUtf8(state2, 0, numBytes - j));
282 // ByteString reduplication should not affect its UTF-8 validity.
283 ByteString ropeADope =
284 RopeByteString.newInstanceForTest(bs, bs.substring(0, numBytes));
285 assertEquals(isRoundTrippable, ropeADope.isValidUtf8());
// NOTE(review): the body of this if is not visible; presumably it increments
// countRoundTripped - confirm.
287 if (isRoundTrippable) {
// Progress message every 1,000,000 values (skipping byteChar == 0).
291 if (byteChar != 0 && byteChar % 1000000L == 0) {
292 logger.info("Processed " + (byteChar / 1000000L) +
293 " million characters");
// NOTE(review): `count` is read here but neither its declaration nor any
// update to it is visible in this extract.
296 logger.info("Round tripped " + countRoundTripped + " of " + count);
297 assertEquals(expectedCount, countRoundTripped);
/**
 * Variation of {@link #testBytes} that does less allocation using the
 * low-level encoders/decoders directly. Checked in because it's useful for
 * debugging when trying to process bytes faster, but since it doesn't use the
 * actual String class, it's possible for incompatibilities to develop
 * (although unlikely).
 *
 * @param numBytes the number of bytes in the byte array
 * @param expectedCount the expected number of roundtrippable permutations
 * @param start the starting bytes encoded as a long as big-endian
 * @param lim the limit of bytes to process encoded as a long as big-endian,
 *     or -1 to mean the max limit for numBytes
 */
// Same walk over byteChar values as testBytes, but the decode/re-encode round
// trip goes through one CharsetDecoder/CharsetEncoder pair and three buffers
// that are allocated once before the loop and reused on every iteration.
// NOTE(review): this extract has lost many short lines (closing braces, guard
// conditions, buffer resets, counter updates); the notes below mark places
// where the visible code depends on a missing line - confirm each against the
// complete file.
313 void testBytesUsingByteBuffers(
314 int numBytes, long expectedCount, long start, long lim)
315 throws UnsupportedEncodingException {
// Both coders substitute a replacement for malformed or unmappable input
// instead of reporting an error.
316 CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder()
317 .onMalformedInput(CodingErrorAction.REPLACE)
318 .onUnmappableCharacter(CodingErrorAction.REPLACE);
319 CharsetEncoder encoder = Charset.forName("UTF-8").newEncoder()
320 .onMalformedInput(CodingErrorAction.REPLACE)
321 .onUnmappableCharacter(CodingErrorAction.REPLACE);
322 byte[] bytes = new byte[numBytes];
// Buffer sizes are derived from the coders' own per-unit maxima, plus one.
// The charsDecoded length repeats the expression already stored in maxChars.
323 int maxChars = (int) (decoder.maxCharsPerByte() * numBytes) + 1;
324 char[] charsDecoded =
325 new char[(int) (decoder.maxCharsPerByte() * numBytes) + 1];
326 int maxBytes = (int) (encoder.maxBytesPerChar() * maxChars) + 1;
327 byte[] bytesReencoded = new byte[maxBytes];
// The buffers wrap the arrays above, so writing into `bytes` inside the loop
// changes what bb exposes, and whatever the encoder writes to bbReencoded
// lands in bytesReencoded.
329 ByteBuffer bb = ByteBuffer.wrap(bytes);
330 CharBuffer cb = CharBuffer.wrap(charsDecoded);
331 ByteBuffer bbReencoded = ByteBuffer.wrap(bytesReencoded);
// 2^(8 * numBytes): one past the largest value representable in numBytes bytes.
// NOTE(review): the parameter documentation says lim == -1 selects this
// default; the guard that would make the assignment conditional is not
// visible (listing jumps 331 -> 333).
333 lim = 1L << (numBytes * 8);
336 long countRoundTripped = 0;
337 for (long byteChar = start; byteChar < lim; byteChar++) {
// Restore buffer limits for this iteration.
// NOTE(review): only bbReencoded.rewind() is visible; the matching position
// resets for bb and cb (listing lines 338 and 340) are missing - confirm.
339 bb.limit(bytes.length);
341 cb.limit(charsDecoded.length);
342 bbReencoded.rewind();
343 bbReencoded.limit(bytesReencoded.length);
// Unpack byteChar big-endian: the lowest 8 bits go into the last array slot.
346 long tmpByteChar = byteChar;
347 for (int i = 0; i < bytes.length; i++) {
348 bytes[bytes.length - i - 1] = (byte) tmpByteChar;
349 tmpByteChar = tmpByteChar >> 8;
351 boolean isRoundTrippable = ByteString.copyFrom(bytes).isValidUtf8();
// Decode all input in one call (endOfInput = true), then flush the decoder.
352 CoderResult result = decoder.decode(bb, cb, true);
353 assertFalse(result.isError());
354 result = decoder.flush(cb);
355 assertFalse(result.isError());
// Number of chars the decoder produced.
// NOTE(review): charLen is not used in any visible line; the code that
// prepares cb for reading before the encode step (listing lines 358-359) is
// missing.
357 int charLen = cb.position();
// Encode the decoded chars back to bytes, then flush the encoder.
360 result = encoder.encode(cb, bbReencoded, true);
361 assertFalse(result.isError());
362 result = encoder.flush(bbReencoded);
363 assertFalse(result.isError());
// Compare the re-encoded output with the input: first by length, then byte by
// byte.
// NOTE(review): the statements that set bytesEqual to false (and the else that
// presumably separates the two checks) are not visible in this extract.
365 boolean bytesEqual = true;
366 int bytesLen = bbReencoded.position();
367 if (bytesLen != numBytes) {
370 for (int i = 0; i < numBytes; i++) {
371 if (bytes[i] != bytesReencoded[i]) {
// Report any disagreement with isValidUtf8(), limiting the printed output to
// the bytesLen bytes actually written.
377 if (bytesEqual != isRoundTrippable) {
378 outputFailure(byteChar, bytes, bytesReencoded, bytesLen);
// NOTE(review): the body of this if is not visible; presumably it increments
// countRoundTripped - confirm.
382 if (isRoundTrippable) {
// Progress message every 1,000,000 values (skipping byteChar == 0).
385 if (byteChar != 0 && byteChar % 1000000 == 0) {
386 logger.info("Processed " + (byteChar / 1000000) +
387 " million characters");
// NOTE(review): `count` is read here but neither its declaration nor any
// update to it is visible in this extract.
390 logger.info("Round tripped " + countRoundTripped + " of " + count);
391 assertEquals(expectedCount, countRoundTripped);
394 private static void outputFailure(long byteChar, byte[] bytes, byte[] after) {
395 outputFailure(byteChar, bytes, after, after.length);
398 private static void outputFailure(long byteChar, byte[] bytes, byte[] after,
400 fail("Failure: (" + Long.toHexString(byteChar) + ") " +
401 toHexString(bytes) + " => " + toHexString(after, len));
404 private static String toHexString(byte[] b) {
405 return toHexString(b, b.length);
408 private static String toHexString(byte[] b, int len) {
409 StringBuilder s = new StringBuilder();
411 for (int i = 0; i < len; i++) {
415 s.append(String.format("%02x", b[i] & 0xFF));