CPP: Replace direct UTF-8 with escape sequences.
author philip.liard@gmail.com <philip.liard@gmail.com@ee073f10-1060-11df-b6a4-87a95322a99c>
Mon, 4 Jul 2011 15:09:31 +0000 (15:09 +0000)
committer philip.liard@gmail.com <philip.liard@gmail.com@ee073f10-1060-11df-b6a4-87a95322a99c>
Mon, 4 Jul 2011 15:09:31 +0000 (15:09 +0000)
git-svn-id: http://libphonenumber.googlecode.com/svn/trunk@289 ee073f10-1060-11df-b6a4-87a95322a99c

cpp/src/phonenumbers/phonenumberutil.cc
cpp/src/phonenumbers/phonenumberutil_test.cc
cpp/src/phonenumbers/regexp_adapter_test.cc

index 00893d12979b45de07f817567d641d8e18090c1f..690147c381d253dc006e97bb4d1d8c758af6fd4a 100644 (file)
@@ -62,7 +62,7 @@ using std::stringstream;
 using google::protobuf::RepeatedPtrField;
 
 // static
-const char PhoneNumberUtil::kPlusChars[] = "++";
+const char PhoneNumberUtil::kPlusChars[] = "+\xEF\xBC\x8B";  /* "++" */
 // To find out the unicode code-point of the characters below in vim, highlight
 // the character and type 'ga'. Note that the - is used to express ranges of
 // full-width punctuation below, as well as being present in the expression
@@ -70,7 +70,10 @@ const char PhoneNumberUtil::kPlusChars[] = "++";
 // unicode character.
 // static
 const char PhoneNumberUtil::kValidPunctuation[] =
-    "-x‐-―−ー--/  ​⁠ ()()[].\\[\\]/~⁓∼";
+ /* "-x‐-―−ー--/  <U+200B><U+2060> ()()[].\\[\\]/~⁓∼" */
+    "-x\xE2\x80\x90-\xE2\x80\x95\xE2\x88\x92\xE3\x83\xBC\xEF\xBC\x8D-\xEF\xBC"
+    "\x8F \xC2\xA0\xE2\x80\x8B\xE2\x81\xA0\xE3\x80\x80()\xEF\xBC\x88\xEF\xBC"
+    "\x89\xEF\xBC\xBB\xEF\xBC\xBD.\\[\\]/~\xE2\x81\x93\xE2\x88\xBC";
 
 namespace {
 
@@ -450,35 +453,35 @@ void InitializeStaticMapsAndSets() {
   all_plus_number_grouping_symbols->insert(
       make_pair(ToUnicodeCodepoint("-"), '-'));
   all_plus_number_grouping_symbols->insert(
-      make_pair(ToUnicodeCodepoint("-"), '-'));
+      make_pair(ToUnicodeCodepoint("\xEF\xBC\x8D" /* "-" */), '-'));
   all_plus_number_grouping_symbols->insert(
-      make_pair(ToUnicodeCodepoint("‐"), '-'));
+      make_pair(ToUnicodeCodepoint("\xE2\x80\x90" /* "‐" */), '-'));
   all_plus_number_grouping_symbols->insert(
-      make_pair(ToUnicodeCodepoint("‑"), '-'));
+      make_pair(ToUnicodeCodepoint("\xE2\x80\x91" /* "‑" */), '-'));
   all_plus_number_grouping_symbols->insert(
-      make_pair(ToUnicodeCodepoint("‒"), '-'));
+      make_pair(ToUnicodeCodepoint("\xE2\x80\x92" /* "‒" */), '-'));
   all_plus_number_grouping_symbols->insert(
-      make_pair(ToUnicodeCodepoint("–"), '-'));
+      make_pair(ToUnicodeCodepoint("\xE2\x80\x93" /* "–" */), '-'));
   all_plus_number_grouping_symbols->insert(
-      make_pair(ToUnicodeCodepoint("—"), '-'));
+      make_pair(ToUnicodeCodepoint("\xE2\x80\x94" /* "—" */), '-'));
   all_plus_number_grouping_symbols->insert(
-      make_pair(ToUnicodeCodepoint("―"), '-'));
+      make_pair(ToUnicodeCodepoint("\xE2\x80\x95" /* "―" */), '-'));
   all_plus_number_grouping_symbols->insert(
-      make_pair(ToUnicodeCodepoint("−"), '-'));
+      make_pair(ToUnicodeCodepoint("\xE2\x88\x92" /* "−" */), '-'));
   all_plus_number_grouping_symbols->insert(
       make_pair(ToUnicodeCodepoint("/"), '/'));
   all_plus_number_grouping_symbols->insert(
-      make_pair(ToUnicodeCodepoint("/"), '/'));
+      make_pair(ToUnicodeCodepoint("\xEF\xBC\x8F" /* "/" */), '/'));
   all_plus_number_grouping_symbols->insert(
       make_pair(ToUnicodeCodepoint(" "), ' '));
   all_plus_number_grouping_symbols->insert(
-      make_pair(ToUnicodeCodepoint(" "), ' '));
+      make_pair(ToUnicodeCodepoint("\xE3\x80\x80" /* " " */), ' '));
   all_plus_number_grouping_symbols->insert(
-      make_pair(ToUnicodeCodepoint(""), ' '));
+      make_pair(ToUnicodeCodepoint("\xE2\x81\xA0"), ' '));
   all_plus_number_grouping_symbols->insert(
       make_pair(ToUnicodeCodepoint("."), '.'));
   all_plus_number_grouping_symbols->insert(
-      make_pair(ToUnicodeCodepoint("."), '.'));
+      make_pair(ToUnicodeCodepoint("\xEF\xBC\x8E" /* "." */), '.'));
   // Only the upper-case letters are added here - the lower-case versions are
   // added programmatically.
   alpha_mappings->insert(make_pair(ToUnicodeCodepoint("A"), '2'));
@@ -692,7 +695,8 @@ PhoneNumberUtil* PhoneNumberUtil::GetInstance() {
 
 void PhoneNumberUtil::CreateRegularExpressions() const {
   unique_international_prefix.reset(RegExp::Create(
-      "[\\d]+(?:[~⁓∼~][\\d]+)?"));
+   /* "[\\d]+(?:[~⁓∼~][\\d]+)?" */
+      "[\\d]+(?:[~\xE2\x81\x93\xE2\x88\xBC\xEF\xBD\x9E][\\d]+)?"));
   // The first_group_capturing_pattern was originally set to $1 but there are
   // some countries for which the first group is not used in the national
   // pattern (e.g. Argentina) so the $1 group does not match correctly.
@@ -720,10 +724,16 @@ void PhoneNumberUtil::CreateRegularExpressions() const {
   const string capturing_extn_digits = StrCat("([", kDigits, "]{1,7})");
   known_extn_patterns.reset(new string(
       StrCat(kRfc3966ExtnPrefix, capturing_extn_digits, "|"
-             "[  \\t,]*(?:ext(?:ensi(?:ó?|ó))?n?|extn?|[,xx##~~]|"
+          /* "[  \\t,]*(?:ext(?:ensi(?:ó?|ó))?n?|extn?|[,xx##~~]|"
              "int|int|anexo)"
-             "[:\\..]?[  \\t,-]*", capturing_extn_digits, "#?|"
-             "[- ]+([", kDigits, "]{1,5})#")));
+             "[:\\..]?[  \\t,-]*", capturing_extn_digits, "#?|" */
+             "[ \xC2\xA0\\t,]*(?:ext(?:ensi(?:o\xCC\x81?|\xC3\xB3))?n?|\xEF\xBD"
+             "\x85\xEF\xBD\x98\xEF\xBD\x94\xEF\xBD\x8E?|[,x\xEF\xBD\x98#\xEF"
+             "\xBC\x83~\xEF\xBD\x9E]|"
+             "int|\xEF\xBD\x89\xEF\xBD\x8E\xEF\xBD\x94|anexo)"
+             "[:\\.\xEF\xBC\x8E]?[ \xC2\xA0\\t,-]*", capturing_extn_digits,
+             "#?|[- ]+([", kDigits, "]{1,5})#")));
+
   extn_pattern.reset(RegExp::Create(
       StrCat("(?i)(?:", *known_extn_patterns, ")$")));
   valid_phone_number_pattern.reset(RegExp::Create(
index 2badb9664ae169c3a0255dd907a039df9e219c3a..ce206cecc62a454e2e00a6f83439a8e4769d4772 100644 (file)
@@ -1145,11 +1145,15 @@ TEST_F(PhoneNumberUtilTest, ExtractPossibleNumber) {
   ExtractPossibleNumber("Tel:+800-345-600", &extracted_number);
   EXPECT_EQ("+800-345-600", extracted_number);
   // Should recognise wide digits as possible start values.
-  ExtractPossibleNumber("023", &extracted_number);
-  EXPECT_EQ("023", extracted_number);
+  ExtractPossibleNumber("\xEF\xBC\x90\xEF\xBC\x92\xEF\xBC\x93" /* "023" */,
+                        &extracted_number);
+  EXPECT_EQ("\xEF\xBC\x90\xEF\xBC\x92\xEF\xBC\x93" /* "023" */,
+            extracted_number);
   // Dashes are not possible start values and should be removed.
-  ExtractPossibleNumber("Num-123", &extracted_number);
-  EXPECT_EQ("123", extracted_number);
+  ExtractPossibleNumber("Num-\xEF\xBC\x91\xEF\xBC\x92\xEF\xBC\x93"
+                        /* "Num-123" */, &extracted_number);
+  EXPECT_EQ("\xEF\xBC\x91\xEF\xBC\x92\xEF\xBC\x93" /* "123" */,
+            extracted_number);
   // If not possible number present, return empty string.
   ExtractPossibleNumber("Num-....", &extracted_number);
   EXPECT_EQ("", extracted_number);
@@ -1163,7 +1167,8 @@ TEST_F(PhoneNumberUtilTest, ExtractPossibleNumber) {
   ExtractPossibleNumber("(650) 253-0000.", &extracted_number);
   EXPECT_EQ("650) 253-0000", extracted_number);
   // This case has a trailing RTL char.
-  ExtractPossibleNumber("(650) 253-0000‏", &extracted_number);
+  ExtractPossibleNumber("(650) 253-0000\xE2\x80\x8F"
+                        /* "(650) 253-0000‏" */, &extracted_number);
   EXPECT_EQ("650) 253-0000", extracted_number);
 }
 
@@ -1675,13 +1680,15 @@ TEST_F(PhoneNumberUtilTest, IsViablePhoneNumber) {
   EXPECT_TRUE(IsViablePhoneNumber("0800-4-PIZZA"));
   // Only one or two digits before possible punctuation followed by more digits.
   // The punctuation used here is the unicode character u+3000.
-  EXPECT_TRUE(IsViablePhoneNumber("1 34"));
-  EXPECT_FALSE(IsViablePhoneNumber("1 3+4"));
+  EXPECT_TRUE(IsViablePhoneNumber("1\xE3\x80\x80" "34" /* "1 34" */));
+  EXPECT_FALSE(IsViablePhoneNumber("1\xE3\x80\x80" "3+4" /* "1 3+4" */));
   // Unicode variants of possible starting character and other allowed
   // punctuation/digits.
-  EXPECT_TRUE(IsViablePhoneNumber("(1) 3456789"));
+  EXPECT_TRUE(IsViablePhoneNumber("\xEF\xBC\x88" "1\xEF\xBC\x89\xE3\x80\x80"
+                                  "3456789" /* "(1) 3456789" */ ));
   // Testing a leading + is okay.
-  EXPECT_TRUE(IsViablePhoneNumber("+1) 3456789"));
+  EXPECT_TRUE(IsViablePhoneNumber("+1\xEF\xBC\x89\xE3\x80\x80"
+                                  "3456789" /* "+1) 3456789" */));
 }
 
 TEST_F(PhoneNumberUtilTest, ConvertAlphaCharactersInNumber) {
@@ -1692,8 +1699,10 @@ TEST_F(PhoneNumberUtilTest, ConvertAlphaCharactersInNumber) {
   EXPECT_EQ(kExpectedOutput, input);
 
   // Try with some non-ASCII characters.
-  input.assign("1 (800) ABC-DEF");
-  static const string kExpectedFullwidthOutput = "1 (800) 222-333";
+  input.assign("1\xE3\x80\x80\xEF\xBC\x88" "800) ABC-DEF"
+               /* "1 (800) ABC-DEF" */);
+  static const string kExpectedFullwidthOutput =
+      "1\xE3\x80\x80\xEF\xBC\x88" "800) 222-333" /* "1 (800) 222-333" */;
   phone_util_.ConvertAlphaCharactersInNumber(&input);
   EXPECT_EQ(kExpectedFullwidthOutput, input);
 }
@@ -1717,13 +1726,13 @@ TEST_F(PhoneNumberUtilTest, NormaliseReplaceAlphaCharacters) {
 TEST_F(PhoneNumberUtilTest, NormaliseOtherDigits) {
   // The first digit is a full-width 2, the last digit is an Arabic-indic digit
   // 5.
-  string input_number("25٥");
+  string input_number("\xEF\xBC\x92" "5\xD9\xA5" /* "25٥" */);
   Normalize(&input_number);
   static const string kExpectedOutput("255");
   EXPECT_EQ(kExpectedOutput, input_number)
       << "Conversion did not correctly replace non-latin digits";
   // The first digit is an Eastern-Arabic 5, the latter an Eastern-Arabic 0.
-  string eastern_arabic_input_number("۵2۰");
+  string eastern_arabic_input_number("\xDB\xB5" "2\xDB\xB0" /* "۵2۰" */);
   Normalize(&eastern_arabic_input_number);
   static const string kExpectedOutput2("520");
   EXPECT_EQ(kExpectedOutput2, eastern_arabic_input_number)
@@ -2379,21 +2388,32 @@ TEST_F(PhoneNumberUtilTest, ParseWithInternationalPrefixes) {
   // Using a full-width plus sign.
   test_number.Clear();
   EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR,
-            phone_util_.Parse("+1 (650) 333-6000",
+            phone_util_.Parse("\xEF\xBC\x8B" "1 (650) 333-6000",
+                              /* "+1 (650) 333-6000" */
                               RegionCode::SG(), &test_number));
   EXPECT_EQ(us_number, test_number);
   // The whole number, including punctuation, is here represented in full-width
   // form.
   test_number.Clear();
   EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR,
-            phone_util_.Parse("+1 (650) 333-6000",
+            phone_util_.Parse("\xEF\xBC\x8B\xEF\xBC\x91\xE3\x80\x80\xEF\xBC\x88"
+                              "\xEF\xBC\x96\xEF\xBC\x95\xEF\xBC\x90\xEF\xBC\x89"
+                              "\xE3\x80\x80\xEF\xBC\x93\xEF\xBC\x93\xEF\xBC\x93"
+                              "\xEF\xBC\x8D\xEF\xBC\x96\xEF\xBC\x90\xEF\xBC\x90"
+                              "\xEF\xBC\x90",
+                              /* "+1 (650) 333-6000" */
                               RegionCode::SG(), &test_number));
   EXPECT_EQ(us_number, test_number);
 
   // Using the U+30FC dash.
   test_number.Clear();
   EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR,
-            phone_util_.Parse("+1 (650) 333ー6000",
+            phone_util_.Parse("\xEF\xBC\x8B\xEF\xBC\x91\xE3\x80\x80\xEF\xBC\x88"
+                              "\xEF\xBC\x96\xEF\xBC\x95\xEF\xBC\x90\xEF\xBC\x89"
+                              "\xE3\x80\x80\xEF\xBC\x93\xEF\xBC\x93\xEF\xBC\x93"
+                              "\xE3\x83\xBC\xEF\xBC\x96\xEF\xBC\x90\xEF\xBC\x90"
+                              "\xEF\xBC\x90",
+                              /* "+1 (650) 333ー6000" */
                               RegionCode::SG(), &test_number));
   EXPECT_EQ(us_number, test_number);
 }
@@ -2633,8 +2653,9 @@ TEST_F(PhoneNumberUtilTest, ParseNumbersWithPlusWithNoRegion) {
   // Test with full-width plus.
   result_proto.Clear();
   EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR,
-            phone_util_.Parse("+64 3 331 6005", RegionCode::GetUnknown(),
-                              &result_proto));
+            phone_util_.Parse("\xEF\xBC\x8B" "64 3 331 6005",
+                              /* "+64 3 331 6005" */
+                              RegionCode::GetUnknown(), &result_proto));
   EXPECT_EQ(nz_number, result_proto);
   // Test with normal plus but leading characters that need to be stripped.
   EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR,
@@ -2792,7 +2813,8 @@ TEST_F(PhoneNumberUtilTest, ParseExtensions) {
   EXPECT_EQ(us_with_extension, test_number);
   test_number.Clear();
   EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR,
-            phone_util_.Parse("(800) 901-3355 ,extensión 7246433",
+            phone_util_.Parse("(800) 901-3355 ,extensi\xC3\xB3n 7246433",
+                              /* "(800) 901-3355 ,extensión 7246433" */
                               RegionCode::US(),
                               &test_number));
   EXPECT_EQ(us_with_extension, test_number);
@@ -2800,7 +2822,8 @@ TEST_F(PhoneNumberUtilTest, ParseExtensions) {
   // Repeat with the small letter o with acute accent created by combining
   // characters.
   EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR,
-            phone_util_.Parse("(800) 901-3355 ,extensión 7246433",
+            phone_util_.Parse("(800) 901-3355 ,extensio\xCC\x81n 7246433",
+                              /* "(800) 901-3355 ,extensión 7246433" */
                               RegionCode::US(),
                               &test_number));
   EXPECT_EQ(us_with_extension, test_number);
index 14fbd40784cda5f176dcb1161d8fc74ef565429f..2b03342245221d3707399a501b2cb3457a38aa17 100644 (file)
@@ -186,12 +186,18 @@ TEST_F(RegExpAdapterTest, TestGlobalReplace) {
 }
 
 TEST(RegExpAdapter, TestUtf8) {
-  const scoped_ptr<const RegExp> reg_exp(RegExp::Create("℡⊏([α-ω]*)⊐"));
+  const scoped_ptr<const RegExp> reg_exp(RegExp::Create(
+      "\xE2\x84\xA1\xE2\x8A\x8F([\xCE\xB1-\xCF\x89]*)\xE2\x8A\x90"
+      /* "℡⊏([α-ω]*)⊐" */));
   string matched;
 
-  EXPECT_FALSE(reg_exp->Match("℡⊏123⊐", true, &matched));
-  EXPECT_TRUE(reg_exp->Match("℡⊏αβ⊐", true, &matched));
-  EXPECT_EQ("αβ", matched);
+  EXPECT_FALSE(reg_exp->Match(
+      "\xE2\x84\xA1\xE2\x8A\x8F" "123\xE2\x8A\x90" /* "℡⊏123⊐" */, true,
+      &matched));
+  EXPECT_TRUE(reg_exp->Match(
+      "\xE2\x84\xA1\xE2\x8A\x8F\xCE\xB1\xCE\xB2\xE2\x8A\x90"
+      /* "℡⊏αβ⊐" */, true, &matched));
+  EXPECT_EQ("\xCE\xB1\xCE\xB2" /* "αβ" */, matched);
 }
 
 }  // namespace phonenumbers