1 // Copyright 2014 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "base/i18n/streaming_utf8_validator.h"
14 #include "base/functional/bind.h"
15 #include "base/location.h"
16 #include "base/logging.h"
17 #include "base/memory/ref_counted.h"
18 #include "base/strings/string_piece.h"
19 #include "base/strings/string_util.h"
20 #include "base/strings/stringprintf.h"
21 #include "base/strings/utf_string_conversion_utils.h"
22 #include "base/synchronization/lock.h"
23 #include "base/task/thread_pool.h"
24 #include "base/test/task_environment.h"
25 #include "testing/gtest/include/gtest/gtest.h"
26 #include "third_party/icu/source/common/unicode/utf8.h"
31 // Avoid having to qualify the enum values in the tests.
32 const StreamingUtf8Validator::State VALID_ENDPOINT =
33 StreamingUtf8Validator::VALID_ENDPOINT;
34 const StreamingUtf8Validator::State VALID_MIDPOINT =
35 StreamingUtf8Validator::VALID_MIDPOINT;
36 const StreamingUtf8Validator::State INVALID = StreamingUtf8Validator::INVALID;
38 const uint32_t kThoroughTestChunkSize = 1 << 24;  // Count of consecutive 4-byte values checked by each posted TestRange task.
40 class StreamingUtf8ValidatorThoroughTest : public ::testing::Test {
42 StreamingUtf8ValidatorThoroughTest()
43 : tasks_dispatched_(0), tasks_finished_(0) {}
45 // This uses the same logic as base::IsStringUTF8 except it considers
46 // non-characters valid (and doesn't require a string as input).
47 static bool IsStringUtf8(const char* src, int32_t src_len) {
48 int32_t char_index = 0;
50 while (char_index < src_len) {
51 base_icu::UChar32 code_point;
52 U8_NEXT(src, char_index, src_len, code_point);
53 if (!base::IsValidCodepoint(code_point))
59 // Converts the passed-in integer to a 4 byte string and then
60 // verifies that IsStringUtf8 and StreamingUtf8Validator agree on
61 // whether it is valid UTF-8 or not.
62 void TestNumber(uint32_t n) const {
64 memcpy(test, &n, sizeof n);
65 StreamingUtf8Validator validator;
66 EXPECT_EQ(IsStringUtf8(test, sizeof n),
67 validator.AddBytes(test, sizeof n) == VALID_ENDPOINT)
68 << "Difference of opinion for \""
69 << base::StringPrintf("\\x%02X\\x%02X\\x%02X\\x%02X",
73 test[3] & 0xFF) << "\"";
77 // Tests the 4-byte sequences corresponding to the |size| integers
78 // starting at |begin|. This is intended to be run from a worker
79 // pool. Signals |all_done_| at the end if it thinks all tasks are
81 void TestRange(uint32_t begin, uint32_t size) {
82 for (uint32_t i = 0; i < size; ++i) {
83 TestNumber(begin + i);
85 base::AutoLock al(lock_);
87 LOG(INFO) << tasks_finished_ << " / " << tasks_dispatched_
93 int tasks_dispatched_;
97 // Enable locally to verify that this class accepts exactly the same set of
98 // 4-byte strings as ICU-based validation. This tests every possible 4-byte
99 // string, so it is too slow to run routinely on low-powered machines.
100 TEST_F(StreamingUtf8ValidatorThoroughTest, DISABLED_TestEverything) {
101 base::test::TaskEnvironment task_environment;
103 base::AutoLock al(lock_);
106 base::ThreadPool::PostTask(
107 FROM_HERE, {base::TaskShutdownBehavior::BLOCK_SHUTDOWN},
108 base::BindOnce(&StreamingUtf8ValidatorThoroughTest::TestRange,
109 base::Unretained(this), begin,
110 kThoroughTestChunkSize));
112 begin += kThoroughTestChunkSize;
113 } while (begin != 0);
117 // These valid and invalid UTF-8 sequences are based on the tests from
118 // base/strings/string_util_unittest.cc
120 // All of the strings in |valid| must represent a single codepoint, because
121 // partial sequences are constructed by taking non-empty prefixes of these
123 const char* const valid[] = {"\r", "\n", "a",
124 "\xc2\x81", "\xe1\x80\xbf", "\xf1\x80\xa0\xbf",
125 "\xef\xbb\xbf", // UTF-8 BOM
128 const char* const* const valid_end = valid + std::size(valid);
130 const char* const invalid[] = {
131 // always invalid bytes
133 "\xf5", "\xf6", "\xf7",
134 "\xf8", "\xf9", "\xfa", "\xfb", "\xfc", "\xfd", "\xfe", "\xff",
135 // surrogate code points
136 "\xed\xa0\x80", "\xed\x0a\x8f", "\xed\xbf\xbf",
138 // overlong sequences
139 "\xc0\x80", // U+0000
142 "\xe0\x80\x80", // U+0000
143 "\xe0\x82\x80", // U+0080
144 "\xe0\x9f\xbf", // U+07ff
145 "\xf0\x80\x80\x8D", // U+000D
146 "\xf0\x80\x82\x91", // U+0091
147 "\xf0\x80\xa0\x80", // U+0800
148 "\xf0\x8f\xbb\xbf", // U+FEFF (BOM)
149 "\xf8\x80\x80\x80\xbf", // U+003F
150 "\xfc\x80\x80\x80\xa0\xa5",
153 "\xf4\x90\x80\x80", // U+110000
154 "\xf8\xa0\xbf\x80\xbf", // 5 bytes
155 "\xfc\x9c\xbf\x80\xbf\x80", // 6 bytes
157 // BOMs in UTF-16(BE|LE)
158 "\xfe\xff", "\xff\xfe",
161 const char* const* const invalid_end = invalid + std::size(invalid);
163 // A ForwardIterator which returns all the non-empty prefixes of the elements of
165 class PartialIterator {
167 // The constructor returns the first iterator, i.e. it is equivalent to
169 PartialIterator() : index_(0), prefix_length_(0) { Advance(); }
170 // The trivial destructor is intentionally left undefined.
171 // This is a value type; the default copy constructor and assignment operator
172 // generated by the compiler are used.
174 static PartialIterator end() { return PartialIterator(std::size(valid), 1); }
176 PartialIterator& operator++() {
181 base::StringPiece operator*() const {
182 return base::StringPiece(valid[index_], prefix_length_);
185 bool operator==(const PartialIterator& rhs) const {
186 return index_ == rhs.index_ && prefix_length_ == rhs.prefix_length_;
189 bool operator!=(const PartialIterator& rhs) const { return !(rhs == *this); }
192 // This constructor is used by the end() method.
193 PartialIterator(size_t index, size_t prefix_length)
194 : index_(index), prefix_length_(prefix_length) {}
197 if (index_ < std::size(valid) && prefix_length_ < strlen(valid[index_]))
199 while (index_ < std::size(valid) &&
200 prefix_length_ == strlen(valid[index_])) {
206 // The UTF-8 sequence, as an offset into the |valid| array.
208 size_t prefix_length_;
211 // A test fixture for tests which test one UTF-8 sequence (or invalid
212 // byte sequence) at a time.
213 class StreamingUtf8ValidatorSingleSequenceTest : public ::testing::Test {
215 // Iterator must be convertible when de-referenced to StringPiece.
216 template <typename Iterator>
217 void CheckRange(Iterator begin,
219 StreamingUtf8Validator::State expected) {
220 for (Iterator it = begin; it != end; ++it) {
221 StreamingUtf8Validator validator;
222 base::StringPiece sequence = *it;
224 validator.AddBytes(sequence.data(), sequence.size()))
225 << "Failed for \"" << sequence << "\"";
229 // Adding input a byte at a time should make absolutely no difference.
230 template <typename Iterator>
231 void CheckRangeByteAtATime(Iterator begin,
233 StreamingUtf8Validator::State expected) {
234 for (Iterator it = begin; it != end; ++it) {
235 StreamingUtf8Validator validator;
236 base::StringPiece sequence = *it;
237 StreamingUtf8Validator::State state = VALID_ENDPOINT;
238 for (const auto& cit : sequence) {
239 state = validator.AddBytes(&cit, 1);
241 EXPECT_EQ(expected, state) << "Failed for \"" << sequence << "\"";
246 // A test fixture for tests which test the concatenation of byte sequences.
247 class StreamingUtf8ValidatorDoubleSequenceTest : public ::testing::Test {
249 // Check every possible concatenation of byte sequences from two
250 // ranges, and verify that the combination matches the expected
252 template <typename Iterator1, typename Iterator2>
253 void CheckCombinations(Iterator1 begin1,
257 StreamingUtf8Validator::State expected) {
258 StreamingUtf8Validator validator;
259 for (Iterator1 it1 = begin1; it1 != end1; ++it1) {
260 base::StringPiece c1 = *it1;
261 for (Iterator2 it2 = begin2; it2 != end2; ++it2) {
262 base::StringPiece c2 = *it2;
263 validator.AddBytes(c1.data(), c1.size());
264 EXPECT_EQ(expected, validator.AddBytes(c2.data(), c2.size()))
265 << "Failed for \"" << c1 << c2 << "\"";
272 TEST(StreamingUtf8ValidatorTest, NothingIsValid) {
273 static const char kNothing[] = "";
274 EXPECT_EQ(VALID_ENDPOINT, StreamingUtf8Validator().AddBytes(kNothing, 0));
277 // Because the members of the |valid| array need to be non-zero length
278 // sequences and are measured with strlen(), |valid| cannot be used
279 // to test the NUL character '\0', so the NUL character gets its own
281 TEST(StreamingUtf8ValidatorTest, NulIsValid) {
282 static const char kNul[] = "\x00";
283 EXPECT_EQ(VALID_ENDPOINT, StreamingUtf8Validator().AddBytes(kNul, 1));
286 // Just a basic sanity test before we start getting fancy.
287 TEST(StreamingUtf8ValidatorTest, HelloWorld) {
288 static const char kHelloWorld[] = "Hello, World!";
291 StreamingUtf8Validator().AddBytes(kHelloWorld, strlen(kHelloWorld)));
294 // Check that the Reset() method works.
295 TEST(StreamingUtf8ValidatorTest, ResetWorks) {
296 StreamingUtf8Validator validator;
297 EXPECT_EQ(INVALID, validator.AddBytes("\xC0", 1));
298 EXPECT_EQ(INVALID, validator.AddBytes("a", 1));
300 EXPECT_EQ(VALID_ENDPOINT, validator.AddBytes("a", 1));
303 TEST_F(StreamingUtf8ValidatorSingleSequenceTest, Valid) {
304 CheckRange(valid, valid_end, VALID_ENDPOINT);
307 TEST_F(StreamingUtf8ValidatorSingleSequenceTest, Partial) {
308 CheckRange(PartialIterator(), PartialIterator::end(), VALID_MIDPOINT);
311 TEST_F(StreamingUtf8ValidatorSingleSequenceTest, Invalid) {
312 CheckRange(invalid, invalid_end, INVALID);
315 TEST_F(StreamingUtf8ValidatorSingleSequenceTest, ValidByByte) {
316 CheckRangeByteAtATime(valid, valid_end, VALID_ENDPOINT);
319 TEST_F(StreamingUtf8ValidatorSingleSequenceTest, PartialByByte) {
320 CheckRangeByteAtATime(
321 PartialIterator(), PartialIterator::end(), VALID_MIDPOINT);
324 TEST_F(StreamingUtf8ValidatorSingleSequenceTest, InvalidByByte) {
325 CheckRangeByteAtATime(invalid, invalid_end, INVALID);
328 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, ValidPlusValidIsValid) {
329 CheckCombinations(valid, valid_end, valid, valid_end, VALID_ENDPOINT);
332 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, ValidPlusPartialIsPartial) {
333 CheckCombinations(valid,
336 PartialIterator::end(),
340 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, PartialPlusValidIsInvalid) {
342 PartialIterator(), PartialIterator::end(), valid, valid_end, INVALID);
345 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, PartialPlusPartialIsInvalid) {
346 CheckCombinations(PartialIterator(),
347 PartialIterator::end(),
349 PartialIterator::end(),
353 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, ValidPlusInvalidIsInvalid) {
354 CheckCombinations(valid, valid_end, invalid, invalid_end, INVALID);
357 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, InvalidPlusValidIsInvalid) {
358 CheckCombinations(invalid, invalid_end, valid, valid_end, INVALID);
361 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, InvalidPlusInvalidIsInvalid) {
362 CheckCombinations(invalid, invalid_end, invalid, invalid_end, INVALID);
365 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, InvalidPlusPartialIsInvalid) {
367 invalid, invalid_end, PartialIterator(), PartialIterator::end(), INVALID);
370 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, PartialPlusInvalidIsInvalid) {
372 PartialIterator(), PartialIterator::end(), invalid, invalid_end, INVALID);
375 TEST(StreamingUtf8ValidatorValidateTest, EmptyIsValid) {
376 EXPECT_TRUE(StreamingUtf8Validator::Validate(std::string()));
379 TEST(StreamingUtf8ValidatorValidateTest, SimpleValidCase) {
380 EXPECT_TRUE(StreamingUtf8Validator::Validate("\xc2\x81"));
383 TEST(StreamingUtf8ValidatorValidateTest, SimpleInvalidCase) {
384 EXPECT_FALSE(StreamingUtf8Validator::Validate("\xc0\x80"));
387 TEST(StreamingUtf8ValidatorValidateTest, TruncatedIsInvalid) {
388 EXPECT_FALSE(StreamingUtf8Validator::Validate("\xc2"));