url/url_canon_icu_unittest.cc

   1 // Copyright 2014 The Chromium Authors
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "url/url_canon_icu.h"
   6
   7 #include <stddef.h>
   8
   9 #include "base/logging.h"
  10 #include "base/memory/raw_ptr.h"
  11 #include "testing/gtest/include/gtest/gtest.h"
  12 #include "third_party/icu/source/common/unicode/ucnv.h"
  13 #include "url/url_canon.h"
  14 #include "url/url_canon_stdstring.h"
  15 #include "url/url_test_utils.h"
  16
  17 namespace url {
  18
  19 namespace {
  20
  21 // Wrapper around a UConverter object that managers creation and destruction.
  22 class UConvScoper {
  23  public:
  24   explicit UConvScoper(const char* charset_name) {
  25     UErrorCode err = U_ZERO_ERROR;
  26     converter_ = ucnv_open(charset_name, &err);
  27     if (!converter_) {
  28       LOG(ERROR) << "Failed to open charset " << charset_name << ": "
  29                  << u_errorName(err);
  30     }
  31   }
  32
  33   ~UConvScoper() {
  34     if (converter_)
  35       ucnv_close(converter_);
  36   }
  37
  38   // Returns the converter object, may be NULL.
  39   UConverter* converter() const { return converter_; }
  40
  41  private:
  42   raw_ptr<UConverter> converter_;
  43 };
  44
  45 TEST(URLCanonIcuTest, ICUCharsetConverter) {
  46   struct ICUCase {
  47     const wchar_t* input;
  48     const char* encoding;
  49     const char* expected;
  50   } icu_cases[] = {
  51       // UTF-8.
  52     {L"Hello, world", "utf-8", "Hello, world"},
  53     {L"\x4f60\x597d", "utf-8", "\xe4\xbd\xa0\xe5\xa5\xbd"},
  54       // Non-BMP UTF-8.
  55     {L"!\xd800\xdf00!", "utf-8", "!\xf0\x90\x8c\x80!"},
  56       // Big5
  57     {L"\x4f60\x597d", "big5", "\xa7\x41\xa6\x6e"},
  58       // Unrepresentable character in the destination set.
  59     {L"hello\x4f60\x06de\x597dworld", "big5",
  60       "hello\xa7\x41%26%231758%3B\xa6\x6eworld"},
  61   };
  62
  63   for (size_t i = 0; i < std::size(icu_cases); i++) {
  64     UConvScoper conv(icu_cases[i].encoding);
  65     ASSERT_TRUE(conv.converter() != NULL);
  66     ICUCharsetConverter converter(conv.converter());
  67
  68     std::string str;
  69     StdStringCanonOutput output(&str);
  70
  71     std::u16string input_str(
  72         test_utils::TruncateWStringToUTF16(icu_cases[i].input));
  73     int input_len = static_cast<int>(input_str.length());
  74     converter.ConvertFromUTF16(input_str.c_str(), input_len, &output);
  75     output.Complete();
  76
  77     EXPECT_STREQ(icu_cases[i].expected, str.c_str());
  78   }
  79
  80   // Test string sizes around the resize boundary for the output to make sure
  81   // the converter resizes as needed.
  82   const int static_size = 16;
  83   UConvScoper conv("utf-8");
  84   ASSERT_TRUE(conv.converter());
  85   ICUCharsetConverter converter(conv.converter());
  86   for (int i = static_size - 2; i <= static_size + 2; i++) {
  87     // Make a string with the appropriate length.
  88     std::u16string input;
  89     for (int ch = 0; ch < i; ch++)
  90       input.push_back('a');
  91
  92     RawCanonOutput<static_size> output;
  93     converter.ConvertFromUTF16(input.c_str(), static_cast<int>(input.length()),
  94                                &output);
  95     EXPECT_EQ(input.length(), output.length());
  96   }
  97 }
  98
  99 TEST(URLCanonIcuTest, QueryWithConverter) {
 100   struct QueryCase {
 101     const char* input8;
 102     const wchar_t* input16;
 103     const char* encoding;
 104     const char* expected;
 105   } query_cases[] = {
 106       // Regular ASCII case in some different encodings.
 107     {"foo=bar", L"foo=bar", "utf-8", "?foo=bar"},
 108     {"foo=bar", L"foo=bar", "shift_jis", "?foo=bar"},
 109     {"foo=bar", L"foo=bar", "gb2312", "?foo=bar"},
 110       // Chinese input/output
 111     {"q=\xe4\xbd\xa0\xe5\xa5\xbd", L"q=\x4f60\x597d", "gb2312",
 112       "?q=%C4%E3%BA%C3"},
 113     {"q=\xe4\xbd\xa0\xe5\xa5\xbd", L"q=\x4f60\x597d", "big5", "?q=%A7A%A6n"},
 114       // Unencodable character in the destination character set should be
 115       // escaped. The escape sequence unescapes to be the entity name:
 116       // "?q=&#20320;"
 117     {"q=Chinese\xef\xbc\xa7", L"q=Chinese\xff27", "iso-8859-1",
 118       "?q=Chinese%26%2365319%3B"},
 119   };
 120
 121   for (size_t i = 0; i < std::size(query_cases); i++) {
 122     Component out_comp;
 123
 124     UConvScoper conv(query_cases[i].encoding);
 125     ASSERT_TRUE(!query_cases[i].encoding || conv.converter());
 126     ICUCharsetConverter converter(conv.converter());
 127
 128     if (query_cases[i].input8) {
 129       int len = static_cast<int>(strlen(query_cases[i].input8));
 130       Component in_comp(0, len);
 131       std::string out_str;
 132
 133       StdStringCanonOutput output(&out_str);
 134       CanonicalizeQuery(query_cases[i].input8, in_comp, &converter, &output,
 135                         &out_comp);
 136       output.Complete();
 137
 138       EXPECT_EQ(query_cases[i].expected, out_str);
 139     }
 140
 141     if (query_cases[i].input16) {
 142       std::u16string input16(
 143           test_utils::TruncateWStringToUTF16(query_cases[i].input16));
 144       int len = static_cast<int>(input16.length());
 145       Component in_comp(0, len);
 146       std::string out_str;
 147
 148       StdStringCanonOutput output(&out_str);
 149       CanonicalizeQuery(input16.c_str(), in_comp, &converter, &output,
 150                         &out_comp);
 151       output.Complete();
 152
 153       EXPECT_EQ(query_cases[i].expected, out_str);
 154     }
 155   }
 156
 157   // Extra test for input with embedded NULL;
 158   std::string out_str;
 159   StdStringCanonOutput output(&out_str);
 160   Component out_comp;
 161   CanonicalizeQuery("a \x00z\x01", Component(0, 5), NULL, &output, &out_comp);
 162   output.Complete();
 163   EXPECT_EQ("?a%20%00z%01", out_str);
 164 }
 165
 166 }  // namespace
 167
 168 }  // namespace url