base/i18n/streaming_utf8_validator_unittest.cc

   1 // Copyright 2014 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "base/i18n/streaming_utf8_validator.h"
   6
   7 #include <stddef.h>
   8 #include <stdint.h>
   9 #include <stdio.h>
  10 #include <string.h>
  11
  12 #include <string>
  13
  14 #include "base/macros.h"
  15 #include "base/strings/string_piece.h"
  16 #include "testing/gtest/include/gtest/gtest.h"
  17
  18 // Define BASE_I18N_UTF8_VALIDATOR_THOROUGH_TEST to verify that this class
  19 // accepts exactly the same set of 4-byte strings as ICU-based validation. This
  20 // tests every possible 4-byte string, so it is too slow to run routinely on
  21 // low-powered machines.
  22 //
  23 // #define BASE_I18N_UTF8_VALIDATOR_THOROUGH_TEST
  24
  25 #ifdef BASE_I18N_UTF8_VALIDATOR_THOROUGH_TEST
  26
  27 #include "base/bind.h"
  28 #include "base/location.h"
  29 #include "base/logging.h"
  30 #include "base/memory/ref_counted.h"
  31 #include "base/strings/string_util.h"
  32 #include "base/strings/stringprintf.h"
  33 #include "base/strings/utf_string_conversion_utils.h"
  34 #include "base/synchronization/lock.h"
  35 #include "base/task_scheduler/post_task.h"
  36 #include "base/task_scheduler/task_scheduler.h"
  37 #include "third_party/icu/source/common/unicode/utf8.h"
  38
  39 #endif  // BASE_I18N_UTF8_VALIDATOR_THOROUGH_TEST
  40
  41 namespace base {
  42 namespace {
  43
  44 // Avoid having to qualify the enum values in the tests.
  45 const StreamingUtf8Validator::State VALID_ENDPOINT =
  46     StreamingUtf8Validator::VALID_ENDPOINT;
  47 const StreamingUtf8Validator::State VALID_MIDPOINT =
  48     StreamingUtf8Validator::VALID_MIDPOINT;
  49 const StreamingUtf8Validator::State INVALID = StreamingUtf8Validator::INVALID;
  50
  51 #ifdef BASE_I18N_UTF8_VALIDATOR_THOROUGH_TEST
  52
  53 const uint32_t kThoroughTestChunkSize = 1 << 24;
  54
  55 class StreamingUtf8ValidatorThoroughTest : public ::testing::Test {
  56  protected:
  57   StreamingUtf8ValidatorThoroughTest()
  58       : tasks_dispatched_(0), tasks_finished_(0) {}
  59
  60   // This uses the same logic as base::IsStringUTF8 except it considers
  61   // non-characters valid (and doesn't require a string as input).
  62   static bool IsStringUtf8(const char* src, int32_t src_len) {
  63     int32_t char_index = 0;
  64
  65     while (char_index < src_len) {
  66       int32_t code_point;
  67       U8_NEXT(src, char_index, src_len, code_point);
  68       if (!base::IsValidCodepoint(code_point))
  69         return false;
  70     }
  71     return true;
  72   }
  73
  74   // Converts the passed-in integer to a 4 byte string and then
  75   // verifies that IsStringUtf8 and StreamingUtf8Validator agree on
  76   // whether it is valid UTF-8 or not.
  77   void TestNumber(uint32_t n) const {
  78     char test[sizeof n];
  79     memcpy(test, &n, sizeof n);
  80     StreamingUtf8Validator validator;
  81     EXPECT_EQ(IsStringUtf8(test, sizeof n),
  82               validator.AddBytes(test, sizeof n) == VALID_ENDPOINT)
  83         << "Difference of opinion for \""
  84         << base::StringPrintf("\\x%02X\\x%02X\\x%02X\\x%02X",
  85                               test[0] & 0xFF,
  86                               test[1] & 0xFF,
  87                               test[2] & 0xFF,
  88                               test[3] & 0xFF) << "\"";
  89   }
  90
  91  public:
  92   // Tests the 4-byte sequences corresponding to the |size| integers
  93   // starting at |begin|. This is intended to be run from a worker
  94   // pool. Signals |all_done_| at the end if it thinks all tasks are
  95   // finished.
  96   void TestRange(uint32_t begin, uint32_t size) {
  97     for (uint32_t i = 0; i < size; ++i) {
  98       TestNumber(begin + i);
  99     }
 100     base::AutoLock al(lock_);
 101     ++tasks_finished_;
 102     LOG(INFO) << tasks_finished_ << " / " << tasks_dispatched_
 103               << " tasks done\n";
 104   }
 105
 106  protected:
 107   base::Lock lock_;
 108   int tasks_dispatched_;
 109   int tasks_finished_;
 110 };
 111
 112 TEST_F(StreamingUtf8ValidatorThoroughTest, TestEverything) {
 113   base::TaskScheduler::CreateAndStartWithDefaultParams(
 114       "StreamingUtf8ValidatorThoroughTest");
 115   {
 116     base::AutoLock al(lock_);
 117     uint32_t begin = 0;
 118     do {
 119       base::PostTaskWithTraits(
 120           FROM_HERE, {base::TaskShutdownBehavior::BLOCK_SHUTDOWN},
 121           base::BindOnce(&StreamingUtf8ValidatorThoroughTest::TestRange,
 122                          base::Unretained(this), begin,
 123                          kThoroughTestChunkSize));
 124       ++tasks_dispatched_;
 125       begin += kThoroughTestChunkSize;
 126     } while (begin != 0);
 127   }
 128   base::TaskScheduler::GetInstance()->Shutdown();
 129   base::TaskScheduler::GetInstance()->JoinForTesting();
 130   base::TaskScheduler::SetInstance(nullptr);
 131 }
 132
 133 #endif  // BASE_I18N_UTF8_VALIDATOR_THOROUGH_TEST
 134
 135 // These valid and invalid UTF-8 sequences are based on the tests from
 136 // base/strings/string_util_unittest.cc
 137
 138 // All of the strings in |valid| must represent a single codepoint, because
 139 // partial sequences are constructed by taking non-empty prefixes of these
 140 // strings.
 141 const char* const valid[] = {"\r",           "\n",           "a",
 142                              "\xc2\x81",     "\xe1\x80\xbf", "\xf1\x80\xa0\xbf",
 143                              "\xef\xbb\xbf",  // UTF-8 BOM
 144 };
 145
 146 const char* const* const valid_end = valid + arraysize(valid);
 147
 148 const char* const invalid[] = {
 149     // always invalid bytes
 150     "\xc0", "\xc1",
 151     "\xf5", "\xf6", "\xf7",
 152     "\xf8", "\xf9", "\xfa", "\xfb", "\xfc", "\xfd", "\xfe", "\xff",
 153     // surrogate code points
 154     "\xed\xa0\x80", "\xed\x0a\x8f", "\xed\xbf\xbf",
 155     //
 156     // overlong sequences
 157     "\xc0\x80",              // U+0000
 158     "\xc1\x80",              // "A"
 159     "\xc1\x81",              // "B"
 160     "\xe0\x80\x80",          // U+0000
 161     "\xe0\x82\x80",          // U+0080
 162     "\xe0\x9f\xbf",          // U+07ff
 163     "\xf0\x80\x80\x8D",      // U+000D
 164     "\xf0\x80\x82\x91",      // U+0091
 165     "\xf0\x80\xa0\x80",      // U+0800
 166     "\xf0\x8f\xbb\xbf",      // U+FEFF (BOM)
 167     "\xf8\x80\x80\x80\xbf",  // U+003F
 168     "\xfc\x80\x80\x80\xa0\xa5",
 169     //
 170     // Beyond U+10FFFF
 171     "\xf4\x90\x80\x80",          // U+110000
 172     "\xf8\xa0\xbf\x80\xbf",      // 5 bytes
 173     "\xfc\x9c\xbf\x80\xbf\x80",  // 6 bytes
 174     //
 175     // BOMs in UTF-16(BE|LE)
 176     "\xfe\xff", "\xff\xfe",
 177 };
 178
 179 const char* const* const invalid_end = invalid + arraysize(invalid);
 180
 181 // A ForwardIterator which returns all the non-empty prefixes of the elements of
 182 // "valid".
 183 class PartialIterator {
 184  public:
 185   // The constructor returns the first iterator, ie. it is equivalent to
 186   // begin().
 187   PartialIterator() : index_(0), prefix_length_(0) { Advance(); }
 188   // The trivial destructor left intentionally undefined.
 189   // This is a value type; the default copy constructor and assignment operator
 190   // generated by the compiler are used.
 191
 192   static PartialIterator end() { return PartialIterator(arraysize(valid), 1); }
 193
 194   PartialIterator& operator++() {
 195     Advance();
 196     return *this;
 197   }
 198
 199   base::StringPiece operator*() const {
 200     return base::StringPiece(valid[index_], prefix_length_);
 201   }
 202
 203   bool operator==(const PartialIterator& rhs) const {
 204     return index_ == rhs.index_ && prefix_length_ == rhs.prefix_length_;
 205   }
 206
 207   bool operator!=(const PartialIterator& rhs) const { return !(rhs == *this); }
 208
 209  private:
 210   // This constructor is used by the end() method.
 211   PartialIterator(size_t index, size_t prefix_length)
 212       : index_(index), prefix_length_(prefix_length) {}
 213
 214   void Advance() {
 215     if (index_ < arraysize(valid) && prefix_length_ < strlen(valid[index_]))
 216       ++prefix_length_;
 217     while (index_ < arraysize(valid) &&
 218            prefix_length_ == strlen(valid[index_])) {
 219       ++index_;
 220       prefix_length_ = 1;
 221     }
 222   }
 223
 224   // The UTF-8 sequence, as an offset into the |valid| array.
 225   size_t index_;
 226   size_t prefix_length_;
 227 };
 228
 229 // A test fixture for tests which test one UTF-8 sequence (or invalid
 230 // byte sequence) at a time.
 231 class StreamingUtf8ValidatorSingleSequenceTest : public ::testing::Test {
 232  protected:
 233   // Iterator must be convertible when de-referenced to StringPiece.
 234   template <typename Iterator>
 235   void CheckRange(Iterator begin,
 236                   Iterator end,
 237                   StreamingUtf8Validator::State expected) {
 238     for (Iterator it = begin; it != end; ++it) {
 239       StreamingUtf8Validator validator;
 240       base::StringPiece sequence = *it;
 241       EXPECT_EQ(expected,
 242                 validator.AddBytes(sequence.data(), sequence.size()))
 243           << "Failed for \"" << sequence << "\"";
 244     }
 245   }
 246
 247   // Adding input a byte at a time should make absolutely no difference.
 248   template <typename Iterator>
 249   void CheckRangeByteAtATime(Iterator begin,
 250                              Iterator end,
 251                              StreamingUtf8Validator::State expected) {
 252     for (Iterator it = begin; it != end; ++it) {
 253       StreamingUtf8Validator validator;
 254       base::StringPiece sequence = *it;
 255       StreamingUtf8Validator::State state = VALID_ENDPOINT;
 256       for (base::StringPiece::const_iterator cit = sequence.begin();
 257            cit != sequence.end();
 258            ++cit) {
 259         state = validator.AddBytes(&*cit, 1);
 260       }
 261       EXPECT_EQ(expected, state) << "Failed for \"" << sequence << "\"";
 262     }
 263   }
 264 };
 265
 266 // A test fixture for tests which test the concatenation of byte sequences.
 267 class StreamingUtf8ValidatorDoubleSequenceTest : public ::testing::Test {
 268  protected:
 269   // Check every possible concatenation of byte sequences from two
 270   // ranges, and verify that the combination matches the expected
 271   // state.
 272   template <typename Iterator1, typename Iterator2>
 273   void CheckCombinations(Iterator1 begin1,
 274                          Iterator1 end1,
 275                          Iterator2 begin2,
 276                          Iterator2 end2,
 277                          StreamingUtf8Validator::State expected) {
 278     StreamingUtf8Validator validator;
 279     for (Iterator1 it1 = begin1; it1 != end1; ++it1) {
 280       base::StringPiece c1 = *it1;
 281       for (Iterator2 it2 = begin2; it2 != end2; ++it2) {
 282         base::StringPiece c2 = *it2;
 283         validator.AddBytes(c1.data(), c1.size());
 284         EXPECT_EQ(expected, validator.AddBytes(c2.data(), c2.size()))
 285             << "Failed for \"" << c1 << c2 << "\"";
 286         validator.Reset();
 287       }
 288     }
 289   }
 290 };
 291
 292 TEST(StreamingUtf8ValidatorTest, NothingIsValid) {
 293   static const char kNothing[] = "";
 294   EXPECT_EQ(VALID_ENDPOINT, StreamingUtf8Validator().AddBytes(kNothing, 0));
 295 }
 296
 297 // Because the members of the |valid| array need to be non-zero length
 298 // sequences and are measured with strlen(), |valid| cannot be used it
 299 // to test the NUL character '\0', so the NUL character gets its own
 300 // test.
 301 TEST(StreamingUtf8ValidatorTest, NulIsValid) {
 302   static const char kNul[] = "\x00";
 303   EXPECT_EQ(VALID_ENDPOINT, StreamingUtf8Validator().AddBytes(kNul, 1));
 304 }
 305
 306 // Just a basic sanity test before we start getting fancy.
 307 TEST(StreamingUtf8ValidatorTest, HelloWorld) {
 308   static const char kHelloWorld[] = "Hello, World!";
 309   EXPECT_EQ(
 310       VALID_ENDPOINT,
 311       StreamingUtf8Validator().AddBytes(kHelloWorld, strlen(kHelloWorld)));
 312 }
 313
 314 // Check that the Reset() method works.
 315 TEST(StreamingUtf8ValidatorTest, ResetWorks) {
 316   StreamingUtf8Validator validator;
 317   EXPECT_EQ(INVALID, validator.AddBytes("\xC0", 1));
 318   EXPECT_EQ(INVALID, validator.AddBytes("a", 1));
 319   validator.Reset();
 320   EXPECT_EQ(VALID_ENDPOINT, validator.AddBytes("a", 1));
 321 }
 322
 323 TEST_F(StreamingUtf8ValidatorSingleSequenceTest, Valid) {
 324   CheckRange(valid, valid_end, VALID_ENDPOINT);
 325 }
 326
 327 TEST_F(StreamingUtf8ValidatorSingleSequenceTest, Partial) {
 328   CheckRange(PartialIterator(), PartialIterator::end(), VALID_MIDPOINT);
 329 }
 330
 331 TEST_F(StreamingUtf8ValidatorSingleSequenceTest, Invalid) {
 332   CheckRange(invalid, invalid_end, INVALID);
 333 }
 334
 335 TEST_F(StreamingUtf8ValidatorSingleSequenceTest, ValidByByte) {
 336   CheckRangeByteAtATime(valid, valid_end, VALID_ENDPOINT);
 337 }
 338
 339 TEST_F(StreamingUtf8ValidatorSingleSequenceTest, PartialByByte) {
 340   CheckRangeByteAtATime(
 341       PartialIterator(), PartialIterator::end(), VALID_MIDPOINT);
 342 }
 343
 344 TEST_F(StreamingUtf8ValidatorSingleSequenceTest, InvalidByByte) {
 345   CheckRangeByteAtATime(invalid, invalid_end, INVALID);
 346 }
 347
 348 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, ValidPlusValidIsValid) {
 349   CheckCombinations(valid, valid_end, valid, valid_end, VALID_ENDPOINT);
 350 }
 351
 352 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, ValidPlusPartialIsPartial) {
 353   CheckCombinations(valid,
 354                     valid_end,
 355                     PartialIterator(),
 356                     PartialIterator::end(),
 357                     VALID_MIDPOINT);
 358 }
 359
 360 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, PartialPlusValidIsInvalid) {
 361   CheckCombinations(
 362       PartialIterator(), PartialIterator::end(), valid, valid_end, INVALID);
 363 }
 364
 365 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, PartialPlusPartialIsInvalid) {
 366   CheckCombinations(PartialIterator(),
 367                     PartialIterator::end(),
 368                     PartialIterator(),
 369                     PartialIterator::end(),
 370                     INVALID);
 371 }
 372
 373 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, ValidPlusInvalidIsInvalid) {
 374   CheckCombinations(valid, valid_end, invalid, invalid_end, INVALID);
 375 }
 376
 377 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, InvalidPlusValidIsInvalid) {
 378   CheckCombinations(invalid, invalid_end, valid, valid_end, INVALID);
 379 }
 380
 381 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, InvalidPlusInvalidIsInvalid) {
 382   CheckCombinations(invalid, invalid_end, invalid, invalid_end, INVALID);
 383 }
 384
 385 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, InvalidPlusPartialIsInvalid) {
 386   CheckCombinations(
 387       invalid, invalid_end, PartialIterator(), PartialIterator::end(), INVALID);
 388 }
 389
 390 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, PartialPlusInvalidIsInvalid) {
 391   CheckCombinations(
 392       PartialIterator(), PartialIterator::end(), invalid, invalid_end, INVALID);
 393 }
 394
 395 TEST(StreamingUtf8ValidatorValidateTest, EmptyIsValid) {
 396   EXPECT_TRUE(StreamingUtf8Validator::Validate(std::string()));
 397 }
 398
 399 TEST(StreamingUtf8ValidatorValidateTest, SimpleValidCase) {
 400   EXPECT_TRUE(StreamingUtf8Validator::Validate("\xc2\x81"));
 401 }
 402
 403 TEST(StreamingUtf8ValidatorValidateTest, SimpleInvalidCase) {
 404   EXPECT_FALSE(StreamingUtf8Validator::Validate("\xc0\x80"));
 405 }
 406
 407 TEST(StreamingUtf8ValidatorValidateTest, TruncatedIsInvalid) {
 408   EXPECT_FALSE(StreamingUtf8Validator::Validate("\xc2"));
 409 }
 410
 411 }  // namespace
 412 }  // namespace base