1 // Copyright (C) 2011 The Libphonenumber Authors
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
15 // Author: Lara Rennie
// Implementation of a stateful class that finds and extracts telephone numbers
// from text.
21 #include "phonenumbers/phonenumbermatcher.h"
23 #ifndef I18N_PHONENUMBERS_USE_ICU_REGEXP
24 #error phonenumbermatcher depends on ICU \
25 (i.e. I18N_PHONENUMBERS_USE_ICU_REGEXP must be set)
26 #endif // I18N_PHONENUMBERS_USE_ICU_REGEXP
37 #include <unicode/uchar.h>
39 #include "phonenumbers/alternate_format.h"
40 #include "phonenumbers/base/logging.h"
41 #include "phonenumbers/base/memory/scoped_ptr.h"
42 #include "phonenumbers/base/memory/singleton.h"
43 #include "phonenumbers/callback.h"
44 #include "phonenumbers/default_logger.h"
45 #include "phonenumbers/encoding_utils.h"
46 #include "phonenumbers/normalize_utf8.h"
47 #include "phonenumbers/phonemetadata.pb.h"
48 #include "phonenumbers/phonenumber.pb.h"
49 #include "phonenumbers/phonenumbermatch.h"
50 #include "phonenumbers/phonenumberutil.h"
51 #include "phonenumbers/regexp_adapter.h"
52 #include "phonenumbers/regexp_adapter_icu.h"
53 #include "phonenumbers/stringutil.h"
55 #ifdef I18N_PHONENUMBERS_USE_RE2
56 #include "phonenumbers/regexp_adapter_re2.h"
#endif  // I18N_PHONENUMBERS_USE_RE2
63 using std::numeric_limits;
68 namespace phonenumbers {
71 // Returns a regular expression quantifier with an upper and lower limit.
72 string Limit(int lower, int upper) {
75 DCHECK_LT(lower, upper);
76 return StrCat("{", lower, ",", upper, "}");
79 bool IsInvalidPunctuationSymbol(char32 character) {
80 return character == '%' || u_charType(character) == U_CURRENCY_SYMBOL;
83 bool ContainsOnlyValidXChars(const PhoneNumber& number, const string& candidate,
84 const PhoneNumberUtil& util) {
85 // The characters 'x' and 'X' can be (1) a carrier code, in which case they
86 // always precede the national significant number or (2) an extension sign,
87 // in which case they always precede the extension number. We assume a
88 // carrier code is more than 1 digit, so the first case has to have more than
89 // 1 consecutive 'x' or 'X', whereas the second case can only have exactly 1
92 found = candidate.find_first_of("xX");
93 // We ignore the character if 'x' or 'X' appears as the last character of
95 while (found != string::npos && found < candidate.length() - 1) {
96 // We only look for 'x' or 'X' in ASCII form.
97 char next_char = candidate[found + 1];
98 if (next_char == 'x' || next_char == 'X') {
99 // This is the carrier code case, in which the 'X's always precede the
100 // national significant number.
102 if (util.IsNumberMatchWithOneString(
103 number, candidate.substr(found, candidate.length() - found))
104 != PhoneNumberUtil::NSN_MATCH) {
108 string normalized_extension(candidate.substr(found,
109 candidate.length() - found));
110 util.NormalizeDigitsOnly(&normalized_extension);
111 if (normalized_extension != number.extension()) {
115 found = candidate.find_first_of("xX", found + 1);
120 bool AllNumberGroupsRemainGrouped(
121 const PhoneNumberUtil& util,
122 const PhoneNumber& phone_number,
123 const string& normalized_candidate,
124 const vector<string>& formatted_number_groups) {
125 size_t from_index = 0;
126 // Check each group of consecutive digits are not broken into separate
127 // groupings in the normalized_candidate string.
128 for (size_t i = 0; i < formatted_number_groups.size(); ++i) {
129 // Fails if the substring of normalized_candidate starting from from_index
130 // doesn't contain the consecutive digits in formatted_number_groups.at(i).
131 from_index = normalized_candidate.find(formatted_number_groups.at(i),
133 if (from_index == string::npos) {
136 // Moves from_index forward.
137 from_index += formatted_number_groups.at(i).length();
138 if (i == 0 && from_index < normalized_candidate.length()) {
139 // We are at the position right after the NDC. Note although
140 // normalized_candidate might contain non-ASCII formatting characters,
141 // they won't be treated as ASCII digits when converted to a char.
142 if (isdigit(normalized_candidate.at(from_index))) {
143 // This means there is no formatting symbol after the NDC. In this case,
144 // we only accept the number if there is no formatting symbol at all in
145 // the number, except for extensions.
146 string national_significant_number;
147 util.GetNationalSignificantNumber(
148 phone_number, &national_significant_number);
149 return HasPrefixString(normalized_candidate.substr(
150 from_index - formatted_number_groups.at(i).length()),
151 national_significant_number);
155 // The check here makes sure that we haven't mistakenly already used the
156 // extension to match the last group of the subscriber number. Note the
157 // extension cannot have formatting in-between digits.
158 return normalized_candidate.substr(from_index)
159 .find(phone_number.extension()) != string::npos;
162 bool LoadAlternateFormats(PhoneMetadataCollection* alternate_formats) {
163 #if defined(I18N_PHONENUMBERS_USE_ALTERNATE_FORMATS)
164 if (!alternate_formats->ParseFromArray(alternate_format_get(),
165 alternate_format_size())) {
166 cerr << "Could not parse binary data." << endl;
177 class PhoneNumberMatcherRegExps : public Singleton<PhoneNumberMatcherRegExps> {
179 friend class Singleton<PhoneNumberMatcherRegExps>;
181 string opening_parens_;
182 string closing_parens_;
184 // Limit on the number of pairs of brackets in a phone number.
185 string bracket_pair_limit_;
186 // Helper strings for the matching_brackets_ pattern.
187 // An opening bracket at the beginning may not be closed, but subsequent ones
188 // should be. It's also possible that the leading bracket was dropped, so we
189 // shouldn't be surprised if we see a closing bracket first.
190 string leading_maybe_matched_bracket_;
191 string bracket_pairs_;
192 // Limit on the number of leading (plus) characters.
194 // Limit on the number of consecutive punctuation characters.
195 string punctuation_limit_;
196 // The maximum number of digits allowed in a digit-separated block. As we
197 // allow all digits in a single block, this should be set high enough to
198 // accommodate the entire national number and the international country code.
199 int digit_block_limit_;
200 // Limit on the number of blocks separated by punctuation. Uses
201 // kDigitBlockLimit since some formats use spaces to separate each digit.
203 // A punctuation sequence allowing white space.
205 // A digits block without punctuation.
206 string digit_sequence_;
207 // Punctuation that may be at the start of a phone number - brackets and plus
209 string lead_class_chars_;
210 // Same as lead_class_chars_, but enclosed as a character class.
212 // Extra helper strings that form part of pattern_. These are stored
213 // separately since StrCat has a limit of 12 args.
214 string opening_punctuation_;
215 string optional_extn_pattern_;
218 // We use two different reg-ex factories here for performance reasons. RE2 is
219 // much faster for smaller reg-ex patterns, but the main pattern cannot be
220 // handled by RE2 in an efficient way.
221 scoped_ptr<const AbstractRegExpFactory> regexp_factory_for_pattern_;
222 scoped_ptr<const AbstractRegExpFactory> regexp_factory_;
224 // Matches strings that look like publication pages. Example:
225 // Computing Complete Answers to Queries in the Presence of Limited Access
226 // Patterns. Chen Li. VLDB J. 12(3): 211-227 (2003).
228 // The string "211-227 (2003)" is not a telephone number.
229 scoped_ptr<const RegExp> pub_pages_;
230 // Matches strings that look like dates using "/" as a separator. Examples:
231 // 3/10/2011, 31/10/96 or 08/31/95.
232 scoped_ptr<const RegExp> slash_separated_dates_;
233 // Matches timestamps. Examples: "2012-01-02 08:00". Note that the reg-ex does
234 // not include trailing ":\d\d" -- that is covered by time_stamps_suffix_.
235 scoped_ptr<const RegExp> time_stamps_;
236 scoped_ptr<const RegExp> time_stamps_suffix_;
237 // Pattern to check that brackets match. Opening brackets should be closed
238 // within a phone number. This also checks that there is something inside the
239 // brackets. Having no brackets at all is also fine.
240 scoped_ptr<const RegExp> matching_brackets_;
241 // Matches white-space, which may indicate the end of a phone number and the
242 // start of something else (such as a neighbouring zip-code). If white-space
243 // is found, continues to match all characters that are not typically used to
244 // start a phone number.
245 scoped_ptr<const RegExp> group_separator_;
246 scoped_ptr<const RegExp> capture_up_to_second_number_start_pattern_;
247 scoped_ptr<const RegExp> capturing_ascii_digits_pattern_;
248 // Compiled reg-ex representing lead_class_;
249 scoped_ptr<const RegExp> lead_class_pattern_;
250 // Phone number pattern allowing optional punctuation.
251 scoped_ptr<const RegExp> pattern_;
253 PhoneNumberMatcherRegExps()
254 : opening_parens_("(\\[\xEF\xBC\x88\xEF\xBC\xBB" /* "(\\[([" */),
255 closing_parens_(")\\]\xEF\xBC\x89\xEF\xBC\xBD" /* ")\\])]" */),
256 non_parens_(StrCat("[^", opening_parens_, closing_parens_, "]")),
257 bracket_pair_limit_(Limit(0, 3)),
258 leading_maybe_matched_bracket_(StrCat(
259 "(?:[", opening_parens_, "])?",
260 "(?:", non_parens_, "+[", closing_parens_, "])?")),
261 bracket_pairs_(StrCat(
262 "(?:[", opening_parens_, "]", non_parens_, "+",
263 "[", closing_parens_, "])", bracket_pair_limit_)),
264 lead_limit_(Limit(0, 2)),
265 punctuation_limit_(Limit(0, 4)),
266 digit_block_limit_(PhoneNumberUtil::kMaxLengthForNsn +
267 PhoneNumberUtil::kMaxLengthCountryCode),
268 block_limit_(Limit(0, digit_block_limit_)),
269 punctuation_(StrCat("[", PhoneNumberUtil::kValidPunctuation, "]",
270 punctuation_limit_)),
271 digit_sequence_(StrCat("\\p{Nd}", Limit(1, digit_block_limit_))),
272 lead_class_chars_(StrCat(opening_parens_, PhoneNumberUtil::kPlusChars)),
273 lead_class_(StrCat("[", lead_class_chars_, "]")),
274 opening_punctuation_(StrCat("(?:", lead_class_, punctuation_, ")")),
275 optional_extn_pattern_(StrCat(
277 PhoneNumberUtil::GetInstance()->GetExtnPatternsForMatching(),
279 regexp_factory_for_pattern_(new ICURegExpFactory()),
280 #ifdef I18N_PHONENUMBERS_USE_RE2
281 regexp_factory_(new RE2RegExpFactory()),
283 regexp_factory_(new ICURegExpFactory()),
284 #endif // I18N_PHONENUMBERS_USE_RE2
285 pub_pages_(regexp_factory_->CreateRegExp(
286 "\\d{1,5}-+\\d{1,5}\\s{0,4}\\(\\d{1,4}")),
287 slash_separated_dates_(regexp_factory_->CreateRegExp(
288 "(?:(?:[0-3]?\\d/[01]?\\d)|"
289 "(?:[01]?\\d/[0-3]?\\d))/(?:[12]\\d)?\\d{2}")),
290 time_stamps_(regexp_factory_->CreateRegExp(
291 "[12]\\d{3}[-/]?[01]\\d[-/]?[0-3]\\d [0-2]\\d$")),
292 time_stamps_suffix_(regexp_factory_->CreateRegExp(":[0-5]\\d")),
293 matching_brackets_(regexp_factory_->CreateRegExp(
294 StrCat(leading_maybe_matched_bracket_, non_parens_, "+",
295 bracket_pairs_, non_parens_, "*"))),
296 group_separator_(regexp_factory_->CreateRegExp(
297 StrCat("\\p{Z}", "[^", lead_class_chars_, "\\p{Nd}]*"))),
298 capture_up_to_second_number_start_pattern_(
299 regexp_factory_->CreateRegExp(
300 PhoneNumberUtil::kCaptureUpToSecondNumberStart)),
301 capturing_ascii_digits_pattern_(
302 regexp_factory_->CreateRegExp("(\\d+)")),
303 lead_class_pattern_(regexp_factory_->CreateRegExp(lead_class_)),
304 pattern_(regexp_factory_for_pattern_->CreateRegExp(
305 StrCat("(", opening_punctuation_, lead_limit_,
306 digit_sequence_, "(?:", punctuation_, digit_sequence_, ")",
307 block_limit_, optional_extn_pattern_, ")"))) {
311 DISALLOW_COPY_AND_ASSIGN(PhoneNumberMatcherRegExps);
314 class AlternateFormats : public Singleton<AlternateFormats> {
316 PhoneMetadataCollection format_data_;
318 map<int, const PhoneMetadata*> calling_code_to_alternate_formats_map_;
322 calling_code_to_alternate_formats_map_() {
323 if (!LoadAlternateFormats(&format_data_)) {
324 LOG(DFATAL) << "Could not parse compiled-in metadata.";
327 for (RepeatedPtrField<PhoneMetadata>::const_iterator it =
328 format_data_.metadata().begin();
329 it != format_data_.metadata().end();
331 calling_code_to_alternate_formats_map_.insert(
332 make_pair(it->country_code(), &*it));
336 const PhoneMetadata* GetAlternateFormatsForCountry(int country_calling_code)
338 map<int, const PhoneMetadata*>::const_iterator it =
339 calling_code_to_alternate_formats_map_.find(country_calling_code);
340 if (it != calling_code_to_alternate_formats_map_.end()) {
347 DISALLOW_COPY_AND_ASSIGN(AlternateFormats);
350 PhoneNumberMatcher::PhoneNumberMatcher(const PhoneNumberUtil& util,
352 const string& region_code,
353 PhoneNumberMatcher::Leniency leniency,
355 : reg_exps_(PhoneNumberMatcherRegExps::GetInstance()),
356 alternate_formats_(AlternateFormats::GetInstance()),
359 preferred_region_(region_code),
361 max_tries_(max_tries),
367 PhoneNumberMatcher::PhoneNumberMatcher(const string& text,
368 const string& region_code)
369 : reg_exps_(PhoneNumberMatcherRegExps::GetInstance()),
370 alternate_formats_(NULL), // Not used.
371 phone_util_(*PhoneNumberUtil::GetInstance()),
373 preferred_region_(region_code),
375 max_tries_(numeric_limits<int>::max()),
381 PhoneNumberMatcher::~PhoneNumberMatcher() {
385 bool PhoneNumberMatcher::IsLatinLetter(char32 letter) {
386 // Combining marks are a subset of non-spacing-mark.
387 if (!u_isalpha(letter) && (u_charType(letter) != U_NON_SPACING_MARK)) {
390 UBlockCode block = ublock_getCode(letter);
391 return ((block == UBLOCK_BASIC_LATIN) ||
392 (block == UBLOCK_LATIN_1_SUPPLEMENT) ||
393 (block == UBLOCK_LATIN_EXTENDED_A) ||
394 (block == UBLOCK_LATIN_EXTENDED_ADDITIONAL) ||
395 (block == UBLOCK_LATIN_EXTENDED_B) ||
396 (block == UBLOCK_COMBINING_DIACRITICAL_MARKS));
399 bool PhoneNumberMatcher::ParseAndVerify(const string& candidate, int offset,
400 PhoneNumberMatch* match) {
402 // Check the candidate doesn't contain any formatting which would indicate
403 // that it really isn't a phone number.
404 if (!reg_exps_->matching_brackets_->FullMatch(candidate)) {
408 // If leniency is set to VALID or stricter, we also want to skip numbers that
409 // are surrounded by Latin alphabetic characters, to skip cases like
410 // abc8005001234 or 8005001234def.
411 if (leniency_ >= VALID) {
412 // If the candidate is not at the start of the text, and does not start with
413 // phone-number punctuation, check the previous character.
414 scoped_ptr<RegExpInput> candidate_input(
415 reg_exps_->regexp_factory_->CreateInput(candidate));
417 !reg_exps_->lead_class_pattern_->Consume(candidate_input.get())) {
418 char32 previous_char;
419 const char* previous_char_ptr =
420 EncodingUtils::BackUpOneUTF8Character(text_.c_str(),
421 text_.c_str() + offset);
422 EncodingUtils::DecodeUTF8Char(previous_char_ptr, &previous_char);
423 // We return false if it is a latin letter or an invalid punctuation
425 if (IsInvalidPunctuationSymbol(previous_char) ||
426 IsLatinLetter(previous_char)) {
430 size_t lastCharIndex = offset + candidate.length();
431 if (lastCharIndex < text_.length()) {
433 const char* next_char_ptr =
434 EncodingUtils::AdvanceOneUTF8Character(
435 text_.c_str() + lastCharIndex - 1);
436 EncodingUtils::DecodeUTF8Char(next_char_ptr, &next_char);
437 if (IsInvalidPunctuationSymbol(next_char) || IsLatinLetter(next_char)) {
444 if (phone_util_.ParseAndKeepRawInput(candidate, preferred_region_, &number) !=
445 PhoneNumberUtil::NO_PARSING_ERROR) {
448 if (VerifyAccordingToLeniency(leniency_, number, candidate)) {
449 match->set_start(offset);
450 match->set_raw_string(candidate);
451 // We used ParseAndKeepRawInput to create this number, but for now we don't
452 // return the extra values parsed. TODO: stop clearing all values here and
453 // switch all users over to using raw_input() rather than the raw_string()
454 // of PhoneNumberMatch.
455 number.clear_country_code_source();
456 number.clear_preferred_domestic_carrier_code();
457 number.clear_raw_input();
458 match->set_number(number);
464 // Helper method to replace the verification method for each enum in the Java
466 bool PhoneNumberMatcher::VerifyAccordingToLeniency(
467 Leniency leniency, const PhoneNumber& number,
468 const string& candidate) const {
470 case PhoneNumberMatcher::POSSIBLE:
471 return phone_util_.IsPossibleNumber(number);
472 case PhoneNumberMatcher::VALID:
473 if (!phone_util_.IsValidNumber(number) ||
474 !ContainsOnlyValidXChars(number, candidate, phone_util_)) {
477 return IsNationalPrefixPresentIfRequired(number);
478 case PhoneNumberMatcher::STRICT_GROUPING: {
479 if (!phone_util_.IsValidNumber(number) ||
480 !ContainsOnlyValidXChars(number, candidate, phone_util_) ||
481 // Two or more slashes were present.
482 (FindNth(candidate, '/', 2) != string::npos) ||
483 !IsNationalPrefixPresentIfRequired(number)) {
486 ResultCallback4<bool, const PhoneNumberUtil&, const PhoneNumber&,
487 const string&, const vector<string>&>* callback =
488 NewPermanentCallback(&AllNumberGroupsRemainGrouped);
489 bool is_valid = CheckNumberGroupingIsValid(number, candidate, callback);
493 case PhoneNumberMatcher::EXACT_GROUPING: {
494 if (!phone_util_.IsValidNumber(number) ||
495 !ContainsOnlyValidXChars(number, candidate, phone_util_) ||
496 // Two or more slashes were present.
497 (FindNth(candidate, '/', 2) != string::npos) ||
498 !IsNationalPrefixPresentIfRequired(number)) {
501 ResultCallback4<bool, const PhoneNumberUtil&, const PhoneNumber&,
502 const string&, const vector<string>&>* callback =
503 NewPermanentCallback(
504 this, &PhoneNumberMatcher::AllNumberGroupsAreExactlyPresent);
505 bool is_valid = CheckNumberGroupingIsValid(number, candidate, callback);
510 LOG(ERROR) << "No implementation defined for verification for leniency "
511 << static_cast<int>(leniency);
516 bool PhoneNumberMatcher::ExtractInnerMatch(const string& candidate, int offset,
517 PhoneNumberMatch* match) {
519 // Try removing either the first or last "group" in the number and see if this
520 // gives a result. We consider white space to be a possible indication of
521 // the start or end of the phone number.
522 scoped_ptr<RegExpInput> candidate_input(
523 reg_exps_->regexp_factory_->CreateInput(candidate));
524 if (reg_exps_->group_separator_->FindAndConsume(candidate_input.get(),
526 // Try the first group by itself.
527 int group_start_index =
528 candidate.length() - candidate_input->ToString().length();
529 string first_group_only = candidate.substr(0, group_start_index);
530 phone_util_.TrimUnwantedEndChars(&first_group_only);
531 bool success = ParseAndVerify(first_group_only, offset, match);
537 // Try the rest of the candidate without the first group.
538 string without_first_group(candidate_input->ToString());
539 phone_util_.TrimUnwantedEndChars(&without_first_group);
541 ParseAndVerify(without_first_group, offset + group_start_index, match);
547 if (max_tries_ > 0) {
548 while (reg_exps_->group_separator_->FindAndConsume(candidate_input.get(),
550 // Find the last group.
552 int last_group_start =
553 candidate.length() - candidate_input->ToString().length();
554 string without_last_group = candidate.substr(0, last_group_start);
555 phone_util_.TrimUnwantedEndChars(&without_last_group);
556 if (without_last_group == first_group_only) {
557 // If there are only two groups, then the group "without the last group"
558 // is the same as the first group. In these cases, we don't want to
559 // re-check the number group, so we exit already.
562 success = ParseAndVerify(without_last_group, offset, match);
572 bool PhoneNumberMatcher::ExtractMatch(const string& candidate, int offset,
573 PhoneNumberMatch* match) {
575 // Skip a match that is more likely a publication page reference or a date.
576 if (reg_exps_->pub_pages_->PartialMatch(candidate) ||
577 reg_exps_->slash_separated_dates_->PartialMatch(candidate)) {
580 // Skip potential time-stamps.
581 if (reg_exps_->time_stamps_->PartialMatch(candidate)) {
582 scoped_ptr<RegExpInput> following_text(
583 reg_exps_->regexp_factory_->CreateInput(
584 text_.substr(offset + candidate.size())));
585 if (reg_exps_->time_stamps_suffix_->Consume(following_text.get())) {
590 // Try to come up with a valid match given the entire candidate.
591 if (ParseAndVerify(candidate, offset, match)) {
595 // If that failed, try to find an "inner match" - there might be a phone
596 // number within this candidate.
597 return ExtractInnerMatch(candidate, offset, match);
600 bool PhoneNumberMatcher::HasNext() {
601 if (state_ == NOT_READY) {
602 PhoneNumberMatch temp_match;
603 if (!Find(search_index_, &temp_match)) {
606 last_match_.reset(new PhoneNumberMatch(temp_match.start(),
607 temp_match.raw_string(),
608 temp_match.number()));
609 search_index_ = last_match_->end();
613 return state_ == READY;
616 bool PhoneNumberMatcher::Next(PhoneNumberMatch* match) {
618 // Check the state and find the next match as a side-effect if necessary.
622 match->CopyFrom(*last_match_);
624 last_match_.reset(NULL);
628 bool PhoneNumberMatcher::Find(int index, PhoneNumberMatch* match) {
631 scoped_ptr<RegExpInput> text(
632 reg_exps_->regexp_factory_for_pattern_->CreateInput(text_.substr(index)));
634 while ((max_tries_ > 0) &&
635 reg_exps_->pattern_->FindAndConsume(text.get(), &candidate)) {
636 int start = text_.length() - text->ToString().length() - candidate.length();
637 // Check for extra numbers at the end.
638 reg_exps_->capture_up_to_second_number_start_pattern_->
639 PartialMatch(candidate, &candidate);
640 if (ExtractMatch(candidate, start, match)) {
644 index = start + candidate.length();
650 bool PhoneNumberMatcher::CheckNumberGroupingIsValid(
651 const PhoneNumber& phone_number,
652 const string& candidate,
653 ResultCallback4<bool, const PhoneNumberUtil&, const PhoneNumber&,
654 const string&, const vector<string>&>* checker) const {
656 // TODO: Evaluate how this works for other locales (testing has been limited
657 // to NANPA regions) and optimise if necessary.
658 string normalized_candidate =
659 NormalizeUTF8::NormalizeDecimalDigits(candidate);
660 vector<string> formatted_number_groups;
661 GetNationalNumberGroups(phone_number, NULL, // Use default formatting pattern
662 &formatted_number_groups);
663 if (checker->Run(phone_util_, phone_number, normalized_candidate,
664 formatted_number_groups)) {
667 // If this didn't pass, see if there are any alternate formats, and try them
669 const PhoneMetadata* alternate_formats =
670 alternate_formats_->GetAlternateFormatsForCountry(
671 phone_number.country_code());
672 if (alternate_formats) {
673 for (RepeatedPtrField<NumberFormat>::const_iterator it =
674 alternate_formats->number_format().begin();
675 it != alternate_formats->number_format().end(); ++it) {
676 formatted_number_groups.clear();
677 GetNationalNumberGroups(phone_number, &*it, &formatted_number_groups);
678 if (checker->Run(phone_util_, phone_number, normalized_candidate,
679 formatted_number_groups)) {
687 // Helper method to get the national-number part of a number, formatted without
688 // any national prefix, and return it as a set of digit blocks that would be
689 // formatted together.
690 void PhoneNumberMatcher::GetNationalNumberGroups(
691 const PhoneNumber& number,
692 const NumberFormat* formatting_pattern,
693 vector<string>* digit_blocks) const {
694 string rfc3966_format;
695 if (!formatting_pattern) {
696 // This will be in the format +CC-DG;ext=EXT where DG represents groups of
698 phone_util_.Format(number, PhoneNumberUtil::RFC3966, &rfc3966_format);
699 // We remove the extension part from the formatted string before splitting
700 // it into different groups.
701 size_t end_index = rfc3966_format.find(';');
702 if (end_index == string::npos) {
703 end_index = rfc3966_format.length();
705 // The country-code will have a '-' following it.
706 size_t start_index = rfc3966_format.find('-') + 1;
707 SplitStringUsing(rfc3966_format.substr(start_index,
708 end_index - start_index),
711 // We format the NSN only, and split that according to the separator.
712 string national_significant_number;
713 phone_util_.GetNationalSignificantNumber(number,
714 &national_significant_number);
715 phone_util_.FormatNsnUsingPattern(national_significant_number,
717 PhoneNumberUtil::RFC3966,
719 SplitStringUsing(rfc3966_format, "-", digit_blocks);
723 bool PhoneNumberMatcher::IsNationalPrefixPresentIfRequired(
724 const PhoneNumber& number) const {
725 // First, check how we deduced the country code. If it was written in
726 // international format, then the national prefix is not required.
727 if (number.country_code_source() != PhoneNumber::FROM_DEFAULT_COUNTRY) {
730 string phone_number_region;
731 phone_util_.GetRegionCodeForCountryCode(
732 number.country_code(), &phone_number_region);
733 const PhoneMetadata* metadata =
734 phone_util_.GetMetadataForRegion(phone_number_region);
738 // Check if a national prefix should be present when formatting this number.
739 string national_number;
740 phone_util_.GetNationalSignificantNumber(number, &national_number);
741 const NumberFormat* format_rule =
742 phone_util_.ChooseFormattingPatternForNumber(metadata->number_format(),
744 // To do this, we check that a national prefix formatting rule was present and
745 // that it wasn't just the first-group symbol ($1) with punctuation.
746 if (format_rule && !format_rule->national_prefix_formatting_rule().empty()) {
747 if (format_rule->national_prefix_optional_when_formatting()) {
748 // The national-prefix is optional in these cases, so we don't need to
749 // check if it was present.
752 if (phone_util_.FormattingRuleHasFirstGroupOnly(
753 format_rule->national_prefix_formatting_rule())) {
754 // National Prefix not needed for this number.
757 // Normalize the remainder.
758 string raw_input_copy(number.raw_input());
759 // Check if we found a national prefix and/or carrier code at the start of
760 // the raw input, and return the result.
761 phone_util_.NormalizeDigitsOnly(&raw_input_copy);
762 return phone_util_.MaybeStripNationalPrefixAndCarrierCode(
765 NULL); // Don't need to keep the stripped carrier code.
770 bool PhoneNumberMatcher::AllNumberGroupsAreExactlyPresent(
771 const PhoneNumberUtil& util,
772 const PhoneNumber& phone_number,
773 const string& normalized_candidate,
774 const vector<string>& formatted_number_groups) const {
775 const scoped_ptr<RegExpInput> candidate_number(
776 reg_exps_->regexp_factory_->CreateInput(normalized_candidate));
777 vector<string> candidate_groups;
779 while (reg_exps_->capturing_ascii_digits_pattern_->FindAndConsume(
780 candidate_number.get(),
782 candidate_groups.push_back(digit_block);
785 // Set this to the last group, skipping it if the number has an extension.
786 int candidate_number_group_index =
787 phone_number.has_extension() ? candidate_groups.size() - 2
788 : candidate_groups.size() - 1;
789 // First we check if the national significant number is formatted as a block.
790 // We use find and not equals, since the national significant number may be
791 // present with a prefix such as a national number prefix, or the country code
793 string national_significant_number;
794 util.GetNationalSignificantNumber(phone_number,
795 &national_significant_number);
796 if (candidate_groups.size() == 1 ||
797 candidate_groups.at(candidate_number_group_index).find(
798 national_significant_number) != string::npos) {
801 // Starting from the end, go through in reverse, excluding the first group,
802 // and check the candidate and number groups are the same.
803 for (int formatted_number_group_index =
804 (formatted_number_groups.size() - 1);
805 formatted_number_group_index > 0 &&
806 candidate_number_group_index >= 0;
807 --formatted_number_group_index, --candidate_number_group_index) {
808 if (candidate_groups.at(candidate_number_group_index) !=
809 formatted_number_groups.at(formatted_number_group_index)) {
813 // Now check the first group. There may be a national prefix at the start, so
814 // we only check that the candidate group ends with the formatted number
816 return (candidate_number_group_index >= 0 &&
817 HasSuffixString(candidate_groups.at(candidate_number_group_index),
818 formatted_number_groups.at(0)));
821 } // namespace phonenumbers