base/i18n/streaming_utf8_validator_unittest.cc

   1 // Copyright 2014 The Chromium Authors
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "base/i18n/streaming_utf8_validator.h"
   6
   7 #include <stddef.h>
   8 #include <stdint.h>
   9 #include <stdio.h>
  10 #include <string.h>
  11
  12 #include <string>
  13
  14 #include "base/functional/bind.h"
  15 #include "base/location.h"
  16 #include "base/logging.h"
  17 #include "base/memory/ref_counted.h"
  18 #include "base/strings/string_piece.h"
  19 #include "base/strings/string_util.h"
  20 #include "base/strings/stringprintf.h"
  21 #include "base/strings/utf_string_conversion_utils.h"
  22 #include "base/synchronization/lock.h"
  23 #include "base/task/thread_pool.h"
  24 #include "base/test/task_environment.h"
  25 #include "testing/gtest/include/gtest/gtest.h"
  26 #include "third_party/icu/source/common/unicode/utf8.h"
  27
  28 namespace base {
  29 namespace {
  30
  31 // Avoid having to qualify the enum values in the tests.
  32 const StreamingUtf8Validator::State VALID_ENDPOINT =
  33     StreamingUtf8Validator::VALID_ENDPOINT;
  34 const StreamingUtf8Validator::State VALID_MIDPOINT =
  35     StreamingUtf8Validator::VALID_MIDPOINT;
  36 const StreamingUtf8Validator::State INVALID = StreamingUtf8Validator::INVALID;
  37
  38 const uint32_t kThoroughTestChunkSize = 1 << 24;
  39
  40 class StreamingUtf8ValidatorThoroughTest : public ::testing::Test {
  41  protected:
  42   StreamingUtf8ValidatorThoroughTest()
  43       : tasks_dispatched_(0), tasks_finished_(0) {}
  44
  45   // This uses the same logic as base::IsStringUTF8 except it considers
  46   // non-characters valid (and doesn't require a string as input).
  47   static bool IsStringUtf8(const char* src, int32_t src_len) {
  48     int32_t char_index = 0;
  49
  50     while (char_index < src_len) {
  51       base_icu::UChar32 code_point;
  52       U8_NEXT(src, char_index, src_len, code_point);
  53       if (!base::IsValidCodepoint(code_point))
  54         return false;
  55     }
  56     return true;
  57   }
  58
  59   // Converts the passed-in integer to a 4 byte string and then
  60   // verifies that IsStringUtf8 and StreamingUtf8Validator agree on
  61   // whether it is valid UTF-8 or not.
  62   void TestNumber(uint32_t n) const {
  63     char test[sizeof n];
  64     memcpy(test, &n, sizeof n);
  65     StreamingUtf8Validator validator;
  66     EXPECT_EQ(IsStringUtf8(test, sizeof n),
  67               validator.AddBytes(test, sizeof n) == VALID_ENDPOINT)
  68         << "Difference of opinion for \""
  69         << base::StringPrintf("\\x%02X\\x%02X\\x%02X\\x%02X",
  70                               test[0] & 0xFF,
  71                               test[1] & 0xFF,
  72                               test[2] & 0xFF,
  73                               test[3] & 0xFF) << "\"";
  74   }
  75
  76  public:
  77   // Tests the 4-byte sequences corresponding to the |size| integers
  78   // starting at |begin|. This is intended to be run from a worker
  79   // pool. Signals |all_done_| at the end if it thinks all tasks are
  80   // finished.
  81   void TestRange(uint32_t begin, uint32_t size) {
  82     for (uint32_t i = 0; i < size; ++i) {
  83       TestNumber(begin + i);
  84     }
  85     base::AutoLock al(lock_);
  86     ++tasks_finished_;
  87     LOG(INFO) << tasks_finished_ << " / " << tasks_dispatched_
  88               << " tasks done\n";
  89   }
  90
  91  protected:
  92   base::Lock lock_;
  93   int tasks_dispatched_;
  94   int tasks_finished_;
  95 };
  96
  97 // Enable locally to verify that this class accepts exactly the same set of
  98 // 4-byte strings as ICU-based validation. This tests every possible 4-byte
  99 // string, so it is too slow to run routinely on low-powered machines.
 100 TEST_F(StreamingUtf8ValidatorThoroughTest, DISABLED_TestEverything) {
 101   base::test::TaskEnvironment task_environment;
 102   {
 103     base::AutoLock al(lock_);
 104     uint32_t begin = 0;
 105     do {
 106       base::ThreadPool::PostTask(
 107           FROM_HERE, {base::TaskShutdownBehavior::BLOCK_SHUTDOWN},
 108           base::BindOnce(&StreamingUtf8ValidatorThoroughTest::TestRange,
 109                          base::Unretained(this), begin,
 110                          kThoroughTestChunkSize));
 111       ++tasks_dispatched_;
 112       begin += kThoroughTestChunkSize;
 113     } while (begin != 0);
 114   }
 115 }
 116
 117 // These valid and invalid UTF-8 sequences are based on the tests from
 118 // base/strings/string_util_unittest.cc
 119
 120 // All of the strings in |valid| must represent a single codepoint, because
 121 // partial sequences are constructed by taking non-empty prefixes of these
 122 // strings.
 123 const char* const valid[] = {"\r",           "\n",           "a",
 124                              "\xc2\x81",     "\xe1\x80\xbf", "\xf1\x80\xa0\xbf",
 125                              "\xef\xbb\xbf",  // UTF-8 BOM
 126 };
 127
 128 const char* const* const valid_end = valid + std::size(valid);
 129
 130 const char* const invalid[] = {
 131     // always invalid bytes
 132     "\xc0", "\xc1",
 133     "\xf5", "\xf6", "\xf7",
 134     "\xf8", "\xf9", "\xfa", "\xfb", "\xfc", "\xfd", "\xfe", "\xff",
 135     // surrogate code points
 136     "\xed\xa0\x80", "\xed\x0a\x8f", "\xed\xbf\xbf",
 137     //
 138     // overlong sequences
 139     "\xc0\x80",              // U+0000
 140     "\xc1\x80",              // "A"
 141     "\xc1\x81",              // "B"
 142     "\xe0\x80\x80",          // U+0000
 143     "\xe0\x82\x80",          // U+0080
 144     "\xe0\x9f\xbf",          // U+07ff
 145     "\xf0\x80\x80\x8D",      // U+000D
 146     "\xf0\x80\x82\x91",      // U+0091
 147     "\xf0\x80\xa0\x80",      // U+0800
 148     "\xf0\x8f\xbb\xbf",      // U+FEFF (BOM)
 149     "\xf8\x80\x80\x80\xbf",  // U+003F
 150     "\xfc\x80\x80\x80\xa0\xa5",
 151     //
 152     // Beyond U+10FFFF
 153     "\xf4\x90\x80\x80",          // U+110000
 154     "\xf8\xa0\xbf\x80\xbf",      // 5 bytes
 155     "\xfc\x9c\xbf\x80\xbf\x80",  // 6 bytes
 156     //
 157     // BOMs in UTF-16(BE|LE)
 158     "\xfe\xff", "\xff\xfe",
 159 };
 160
 161 const char* const* const invalid_end = invalid + std::size(invalid);
 162
 163 // A ForwardIterator which returns all the non-empty prefixes of the elements of
 164 // "valid".
 165 class PartialIterator {
 166  public:
 167   // The constructor returns the first iterator, ie. it is equivalent to
 168   // begin().
 169   PartialIterator() : index_(0), prefix_length_(0) { Advance(); }
 170   // The trivial destructor left intentionally undefined.
 171   // This is a value type; the default copy constructor and assignment operator
 172   // generated by the compiler are used.
 173
 174   static PartialIterator end() { return PartialIterator(std::size(valid), 1); }
 175
 176   PartialIterator& operator++() {
 177     Advance();
 178     return *this;
 179   }
 180
 181   base::StringPiece operator*() const {
 182     return base::StringPiece(valid[index_], prefix_length_);
 183   }
 184
 185   bool operator==(const PartialIterator& rhs) const {
 186     return index_ == rhs.index_ && prefix_length_ == rhs.prefix_length_;
 187   }
 188
 189   bool operator!=(const PartialIterator& rhs) const { return !(rhs == *this); }
 190
 191  private:
 192   // This constructor is used by the end() method.
 193   PartialIterator(size_t index, size_t prefix_length)
 194       : index_(index), prefix_length_(prefix_length) {}
 195
 196   void Advance() {
 197     if (index_ < std::size(valid) && prefix_length_ < strlen(valid[index_]))
 198       ++prefix_length_;
 199     while (index_ < std::size(valid) &&
 200            prefix_length_ == strlen(valid[index_])) {
 201       ++index_;
 202       prefix_length_ = 1;
 203     }
 204   }
 205
 206   // The UTF-8 sequence, as an offset into the |valid| array.
 207   size_t index_;
 208   size_t prefix_length_;
 209 };
 210
 211 // A test fixture for tests which test one UTF-8 sequence (or invalid
 212 // byte sequence) at a time.
 213 class StreamingUtf8ValidatorSingleSequenceTest : public ::testing::Test {
 214  protected:
 215   // Iterator must be convertible when de-referenced to StringPiece.
 216   template <typename Iterator>
 217   void CheckRange(Iterator begin,
 218                   Iterator end,
 219                   StreamingUtf8Validator::State expected) {
 220     for (Iterator it = begin; it != end; ++it) {
 221       StreamingUtf8Validator validator;
 222       base::StringPiece sequence = *it;
 223       EXPECT_EQ(expected,
 224                 validator.AddBytes(sequence.data(), sequence.size()))
 225           << "Failed for \"" << sequence << "\"";
 226     }
 227   }
 228
 229   // Adding input a byte at a time should make absolutely no difference.
 230   template <typename Iterator>
 231   void CheckRangeByteAtATime(Iterator begin,
 232                              Iterator end,
 233                              StreamingUtf8Validator::State expected) {
 234     for (Iterator it = begin; it != end; ++it) {
 235       StreamingUtf8Validator validator;
 236       base::StringPiece sequence = *it;
 237       StreamingUtf8Validator::State state = VALID_ENDPOINT;
 238       for (const auto& cit : sequence) {
 239         state = validator.AddBytes(&cit, 1);
 240       }
 241       EXPECT_EQ(expected, state) << "Failed for \"" << sequence << "\"";
 242     }
 243   }
 244 };
 245
 246 // A test fixture for tests which test the concatenation of byte sequences.
 247 class StreamingUtf8ValidatorDoubleSequenceTest : public ::testing::Test {
 248  protected:
 249   // Check every possible concatenation of byte sequences from two
 250   // ranges, and verify that the combination matches the expected
 251   // state.
 252   template <typename Iterator1, typename Iterator2>
 253   void CheckCombinations(Iterator1 begin1,
 254                          Iterator1 end1,
 255                          Iterator2 begin2,
 256                          Iterator2 end2,
 257                          StreamingUtf8Validator::State expected) {
 258     StreamingUtf8Validator validator;
 259     for (Iterator1 it1 = begin1; it1 != end1; ++it1) {
 260       base::StringPiece c1 = *it1;
 261       for (Iterator2 it2 = begin2; it2 != end2; ++it2) {
 262         base::StringPiece c2 = *it2;
 263         validator.AddBytes(c1.data(), c1.size());
 264         EXPECT_EQ(expected, validator.AddBytes(c2.data(), c2.size()))
 265             << "Failed for \"" << c1 << c2 << "\"";
 266         validator.Reset();
 267       }
 268     }
 269   }
 270 };
 271
 272 TEST(StreamingUtf8ValidatorTest, NothingIsValid) {
 273   static const char kNothing[] = "";
 274   EXPECT_EQ(VALID_ENDPOINT, StreamingUtf8Validator().AddBytes(kNothing, 0));
 275 }
 276
 277 // Because the members of the |valid| array need to be non-zero length
 278 // sequences and are measured with strlen(), |valid| cannot be used it
 279 // to test the NUL character '\0', so the NUL character gets its own
 280 // test.
 281 TEST(StreamingUtf8ValidatorTest, NulIsValid) {
 282   static const char kNul[] = "\x00";
 283   EXPECT_EQ(VALID_ENDPOINT, StreamingUtf8Validator().AddBytes(kNul, 1));
 284 }
 285
 286 // Just a basic sanity test before we start getting fancy.
 287 TEST(StreamingUtf8ValidatorTest, HelloWorld) {
 288   static const char kHelloWorld[] = "Hello, World!";
 289   EXPECT_EQ(
 290       VALID_ENDPOINT,
 291       StreamingUtf8Validator().AddBytes(kHelloWorld, strlen(kHelloWorld)));
 292 }
 293
 294 // Check that the Reset() method works.
 295 TEST(StreamingUtf8ValidatorTest, ResetWorks) {
 296   StreamingUtf8Validator validator;
 297   EXPECT_EQ(INVALID, validator.AddBytes("\xC0", 1));
 298   EXPECT_EQ(INVALID, validator.AddBytes("a", 1));
 299   validator.Reset();
 300   EXPECT_EQ(VALID_ENDPOINT, validator.AddBytes("a", 1));
 301 }
 302
 303 TEST_F(StreamingUtf8ValidatorSingleSequenceTest, Valid) {
 304   CheckRange(valid, valid_end, VALID_ENDPOINT);
 305 }
 306
 307 TEST_F(StreamingUtf8ValidatorSingleSequenceTest, Partial) {
 308   CheckRange(PartialIterator(), PartialIterator::end(), VALID_MIDPOINT);
 309 }
 310
 311 TEST_F(StreamingUtf8ValidatorSingleSequenceTest, Invalid) {
 312   CheckRange(invalid, invalid_end, INVALID);
 313 }
 314
 315 TEST_F(StreamingUtf8ValidatorSingleSequenceTest, ValidByByte) {
 316   CheckRangeByteAtATime(valid, valid_end, VALID_ENDPOINT);
 317 }
 318
 319 TEST_F(StreamingUtf8ValidatorSingleSequenceTest, PartialByByte) {
 320   CheckRangeByteAtATime(
 321       PartialIterator(), PartialIterator::end(), VALID_MIDPOINT);
 322 }
 323
 324 TEST_F(StreamingUtf8ValidatorSingleSequenceTest, InvalidByByte) {
 325   CheckRangeByteAtATime(invalid, invalid_end, INVALID);
 326 }
 327
 328 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, ValidPlusValidIsValid) {
 329   CheckCombinations(valid, valid_end, valid, valid_end, VALID_ENDPOINT);
 330 }
 331
 332 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, ValidPlusPartialIsPartial) {
 333   CheckCombinations(valid,
 334                     valid_end,
 335                     PartialIterator(),
 336                     PartialIterator::end(),
 337                     VALID_MIDPOINT);
 338 }
 339
 340 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, PartialPlusValidIsInvalid) {
 341   CheckCombinations(
 342       PartialIterator(), PartialIterator::end(), valid, valid_end, INVALID);
 343 }
 344
 345 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, PartialPlusPartialIsInvalid) {
 346   CheckCombinations(PartialIterator(),
 347                     PartialIterator::end(),
 348                     PartialIterator(),
 349                     PartialIterator::end(),
 350                     INVALID);
 351 }
 352
 353 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, ValidPlusInvalidIsInvalid) {
 354   CheckCombinations(valid, valid_end, invalid, invalid_end, INVALID);
 355 }
 356
 357 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, InvalidPlusValidIsInvalid) {
 358   CheckCombinations(invalid, invalid_end, valid, valid_end, INVALID);
 359 }
 360
 361 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, InvalidPlusInvalidIsInvalid) {
 362   CheckCombinations(invalid, invalid_end, invalid, invalid_end, INVALID);
 363 }
 364
 365 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, InvalidPlusPartialIsInvalid) {
 366   CheckCombinations(
 367       invalid, invalid_end, PartialIterator(), PartialIterator::end(), INVALID);
 368 }
 369
 370 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, PartialPlusInvalidIsInvalid) {
 371   CheckCombinations(
 372       PartialIterator(), PartialIterator::end(), invalid, invalid_end, INVALID);
 373 }
 374
 375 TEST(StreamingUtf8ValidatorValidateTest, EmptyIsValid) {
 376   EXPECT_TRUE(StreamingUtf8Validator::Validate(std::string()));
 377 }
 378
 379 TEST(StreamingUtf8ValidatorValidateTest, SimpleValidCase) {
 380   EXPECT_TRUE(StreamingUtf8Validator::Validate("\xc2\x81"));
 381 }
 382
 383 TEST(StreamingUtf8ValidatorValidateTest, SimpleInvalidCase) {
 384   EXPECT_FALSE(StreamingUtf8Validator::Validate("\xc0\x80"));
 385 }
 386
 387 TEST(StreamingUtf8ValidatorValidateTest, TruncatedIsInvalid) {
 388   EXPECT_FALSE(StreamingUtf8Validator::Validate("\xc2"));
 389 }
 390
 391 }  // namespace
 392 }  // namespace base