1 // Copyright (C) 2011 The Libphonenumber Authors
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
15 // Author: Lara Rennie
18 // Implementation of a stateful class that finds and extracts telephone numbers
// from text.
21 #include "phonenumbers/phonenumbermatcher.h"
23 #ifndef I18N_PHONENUMBERS_USE_ICU_REGEXP
24 #error phonenumbermatcher depends on ICU \
25 (i.e. I18N_PHONENUMBERS_USE_ICU_REGEXP must be set)
26 #endif // I18N_PHONENUMBERS_USE_ICU_REGEXP
36 #include <unicode/uchar.h>
38 #include "phonenumbers/alternate_format.h"
39 #include "phonenumbers/base/logging.h"
40 #include "phonenumbers/base/memory/scoped_ptr.h"
41 #include "phonenumbers/base/memory/singleton.h"
42 #include "phonenumbers/callback.h"
43 #include "phonenumbers/default_logger.h"
44 #include "phonenumbers/encoding_utils.h"
45 #include "phonenumbers/normalize_utf8.h"
46 #include "phonenumbers/phonemetadata.pb.h"
47 #include "phonenumbers/phonenumber.pb.h"
48 #include "phonenumbers/phonenumbermatch.h"
49 #include "phonenumbers/phonenumberutil.h"
50 #include "phonenumbers/regexp_adapter.h"
51 #include "phonenumbers/regexp_adapter_icu.h"
52 #include "phonenumbers/stringutil.h"
54 #ifdef I18N_PHONENUMBERS_USE_RE2
55 #include "phonenumbers/regexp_adapter_re2.h"
56 #endif // I18N_PHONENUMBERS_USE_RE2
59 using std::numeric_limits;
64 namespace phonenumbers {
67 // Returns a regular expression quantifier with an upper and lower limit.
// The result is the text "{lower,upper}", e.g. Limit(0, 3) produces "{0,3}".
68 string Limit(int lower, int upper) {
// Sanity check: the lower bound must be strictly below the upper bound.
71 DCHECK_LT(lower, upper);
72 return StrCat("{", lower, ",", upper, "}");
// Returns true if |character| is '%' or is classified by ICU's u_charType()
// as a currency symbol. ParseAndVerify() rejects a candidate when the
// character immediately before or after it satisfies this predicate.
75 bool IsInvalidPunctuationSymbol(char32 character) {
76 return character == '%' || u_charType(character) == U_CURRENCY_SYMBOL;
// Checks each ASCII 'x'/'X' found in |candidate| against |number|:
//  - when it is immediately followed by another 'x'/'X', the text starting at
//    the current position must be an NSN_MATCH for |number|;
//  - otherwise the digits from that position onwards (after
//    NormalizeDigitsOnly) must equal number.extension().
// NOTE(review): the embedded numbering skips lines inside this function, so
// some statements (e.g. the declaration of |found| and the return statements)
// are not visible in this listing.
79 bool ContainsOnlyValidXChars(const PhoneNumber& number, const string& candidate,
80 const PhoneNumberUtil& util) {
81 // The characters 'x' and 'X' can be (1) a carrier code, in which case they
82 // always precede the national significant number or (2) an extension sign,
83 // in which case they always precede the extension number. We assume a
84 // carrier code is more than 1 digit, so the first case has to have more than
85 // 1 consecutive 'x' or 'X', whereas the second case can only have exactly 1
// 'x' or 'X'.
88 found = candidate.find_first_of("xX");
89 // We ignore the character if 'x' or 'X' appears as the last character of
// the candidate (see the `found < length - 1` loop condition).
91 while (found != string::npos && found < candidate.length() - 1) {
92 // We only look for 'x' or 'X' in ASCII form.
93 char next_char = candidate[found + 1];
94 if (next_char == 'x' || next_char == 'X') {
95 // This is the carrier code case, in which the 'X's always precede the
96 // national significant number.
98 if (util.IsNumberMatchWithOneString(
99 number, candidate.substr(found, candidate.length() - found))
100 != PhoneNumberUtil::NSN_MATCH) {
// Extension case: keep only the digits that follow and compare them with the
// extension stored in the parsed number.
104 string normalized_extension(candidate.substr(found,
105 candidate.length() - found));
106 util.NormalizeDigitsOnly(&normalized_extension);
107 if (normalized_extension != number.extension()) {
// Continue the scan with the next 'x'/'X' after the current position.
111 found = candidate.find_first_of("xX", found + 1);
// Checks that every digit group in |formatted_number_groups| appears, in
// order and unbroken, inside |normalized_candidate|.
//
//  util                    - used for region / national-prefix lookups.
//  number                  - the parsed number the candidate was matched to.
//  normalized_candidate    - candidate text (callers pass the result of
//                            NormalizeUTF8::NormalizeDecimalDigits).
//  formatted_number_groups - the digit groups expected for |number|.
//
// Review fixes applied to the visible lines:
//  - the output argument of GetRegionCodeForCountryCode() had been mis-decoded
//    ("&reg" turned into the registered-trademark sign); it is |&region| again.
//  - isdigit() is now given an unsigned char value; passing a negative char
//    (any non-ASCII UTF-8 byte where char is signed) is undefined behaviour.
// NOTE(review): the embedded numbering skips lines inside this function, so
// some declarations, return statements and closing braces are not visible in
// this listing.
116 bool AllNumberGroupsRemainGrouped(
117 const PhoneNumberUtil& util,
118 const PhoneNumber& number,
119 const string& normalized_candidate,
120 const vector<string>& formatted_number_groups) {
121 size_t from_index = 0;
122 if (number.country_code_source() != PhoneNumber::FROM_DEFAULT_COUNTRY) {
123 // First skip the country code if the normalized candidate contained it.
124 string country_code = SimpleItoa(number.country_code());
// NOTE(review): if find() returned npos the sum below would wrap around; this
// relies on the country code being present in the candidate.
125 from_index = normalized_candidate.find(country_code) + country_code.size();
127 // Check each group of consecutive digits are not broken into separate
128 // groupings in the normalized_candidate string.
129 for (size_t i = 0; i < formatted_number_groups.size(); ++i) {
130 // Fails if the substring of normalized_candidate starting from from_index
131 // doesn't contain the consecutive digits in formatted_number_groups.at(i).
132 from_index = normalized_candidate.find(formatted_number_groups.at(i),
134 if (from_index == string::npos) {
137 // Moves from_index forward.
138 from_index += formatted_number_groups.at(i).length();
139 if (i == 0 && from_index < normalized_candidate.length()) {
140 // We are at the position right after the NDC. We get the region used for
141 // formatting information based on the country code in the phone number,
142 // rather than the number itself, as we do not need to distinguish between
143 // different countries with the same country calling code and this is
146 util.GetRegionCodeForCountryCode(number.country_code(), &region);
148 util.GetNddPrefixForRegion(region, true, &ndd_prefix);
149 // Note although normalized_candidate might contain non-ASCII formatting
150 // characters, they won't be treated as ASCII digits when converted to a
// char. The value is cast to unsigned char first because isdigit() has
// undefined behaviour for negative arguments other than EOF.
152 if (!ndd_prefix.empty() &&
isdigit(static_cast<unsigned char>(normalized_candidate.at(from_index)))) {
153 // This means there is no formatting symbol after the NDC. In this case,
154 // we only accept the number if there is no formatting symbol at all in
155 // the number, except for extensions. This is only important for
156 // countries with national prefixes.
157 string national_significant_number;
158 util.GetNationalSignificantNumber(number, &national_significant_number);
159 return HasPrefixString(normalized_candidate.substr(
160 from_index - formatted_number_groups.at(i).length()),
161 national_significant_number);
165 // The check here makes sure that we haven't mistakenly already used the
166 // extension to match the last group of the subscriber number. Note the
167 // extension cannot have formatting in-between digits.
168 return normalized_candidate.substr(from_index)
169 .find(number.extension()) != string::npos;
// Fills |alternate_formats| from the compiled-in alternate-format data when
// I18N_PHONENUMBERS_USE_ALTERNATE_FORMATS is defined, logging an error if
// ParseFromArray() fails.
// NOTE(review): the branch taken when the macro is not defined, and the return
// statements, are not visible in this listing.
172 bool LoadAlternateFormats(PhoneMetadataCollection* alternate_formats) {
173 #if defined(I18N_PHONENUMBERS_USE_ALTERNATE_FORMATS)
174 if (!alternate_formats->ParseFromArray(alternate_format_get(),
175 alternate_format_size())) {
176 LOG(ERROR) << "Could not parse binary data.";
// Singleton that owns the regular-expression building blocks (as strings) and
// the compiled patterns used by PhoneNumberMatcher. Everything is built in the
// constructor's initializer list; later members are composed from earlier
// ones (e.g. bracket_pairs_ from opening_parens_ / non_parens_ /
// closing_parens_).
// NOTE(review): the initializer list references members such as non_parens_,
// lead_limit_, block_limit_, punctuation_ and lead_class_ whose declarations
// are not visible in this listing (the embedded numbering skips lines).
187 class PhoneNumberMatcherRegExps : public Singleton<PhoneNumberMatcherRegExps> {
189 friend class Singleton<PhoneNumberMatcherRegExps>;
// Characters treated as opening / closing brackets (ASCII and full-width).
191 string opening_parens_;
192 string closing_parens_;
194 // Limit on the number of pairs of brackets in a phone number.
195 string bracket_pair_limit_;
196 // Helper strings for the matching_brackets_ pattern.
197 // An opening bracket at the beginning may not be closed, but subsequent ones
198 // should be. It's also possible that the leading bracket was dropped, so we
199 // shouldn't be surprised if we see a closing bracket first.
200 string leading_maybe_matched_bracket_;
201 string bracket_pairs_;
202 // Limit on the number of leading (plus) characters.
204 // Limit on the number of consecutive punctuation characters.
205 string punctuation_limit_;
206 // The maximum number of digits allowed in a digit-separated block. As we
207 // allow all digits in a single block, this should be set high enough to
208 // accommodate the entire national number and the international country code.
209 int digit_block_limit_;
210 // Limit on the number of blocks separated by punctuation. Uses
211 // kDigitBlockLimit since some formats use spaces to separate each digit.
213 // A punctuation sequence allowing white space.
215 // A digits block without punctuation.
216 string digit_sequence_;
217 // Punctuation that may be at the start of a phone number - brackets and plus
219 string lead_class_chars_;
220 // Same as lead_class_chars_, but enclosed as a character class.
222 // Extra helper strings that form part of pattern_. These are stored
223 // separately since StrCat has a limit of 12 args.
224 string opening_punctuation_;
225 string optional_extn_pattern_;
228 // We use two different reg-ex factories here for performance reasons. RE2 is
229 // much faster for smaller reg-ex patterns, but the main pattern cannot be
230 // handled by RE2 in an efficient way.
// regexp_factory_for_pattern_ is always an ICU factory; regexp_factory_ is an
// RE2 factory when I18N_PHONENUMBERS_USE_RE2 is defined, ICU otherwise.
231 scoped_ptr<const AbstractRegExpFactory> regexp_factory_for_pattern_;
232 scoped_ptr<const AbstractRegExpFactory> regexp_factory_;
234 // Matches strings that look like publication pages. Example:
235 // Computing Complete Answers to Queries in the Presence of Limited Access
236 // Patterns. Chen Li. VLDB J. 12(3): 211-227 (2003).
238 // The string "211-227 (2003)" is not a telephone number.
239 scoped_ptr<const RegExp> pub_pages_;
240 // Matches strings that look like dates using "/" as a separator. Examples:
241 // 3/10/2011, 31/10/96 or 08/31/95.
242 scoped_ptr<const RegExp> slash_separated_dates_;
243 // Matches timestamps. Examples: "2012-01-02 08:00". Note that the reg-ex does
244 // not include trailing ":\d\d" -- that is covered by time_stamps_suffix_.
245 scoped_ptr<const RegExp> time_stamps_;
246 scoped_ptr<const RegExp> time_stamps_suffix_;
247 // Pattern to check that brackets match. Opening brackets should be closed
248 // within a phone number. This also checks that there is something inside the
249 // brackets. Having no brackets at all is also fine.
250 scoped_ptr<const RegExp> matching_brackets_;
251 // Patterns used to extract phone numbers from a larger phone-number-like
252 // pattern. These are ordered according to specificity. For example,
253 // white-space is last since that is frequently used in numbers, not just to
254 // separate two numbers. We have separate patterns since we don't want to
255 // break up the phone-number-like text on more than one different kind of
256 // symbol at one time, although symbols of the same type (e.g. space) can be
257 // safely grouped together.
259 // Note that if there is a match, we will always check any text found up to
260 // the first match as well.
// NOTE(review): the vector stores raw RegExp pointers created in the
// constructor body; how they are released is not visible in this listing.
261 scoped_ptr<vector<const RegExp*> > inner_matches_;
262 scoped_ptr<const RegExp> capture_up_to_second_number_start_pattern_;
263 scoped_ptr<const RegExp> capturing_ascii_digits_pattern_;
264 // Compiled reg-ex representing lead_class_;
265 scoped_ptr<const RegExp> lead_class_pattern_;
266 // Phone number pattern allowing optional punctuation.
267 scoped_ptr<const RegExp> pattern_;
// Builds every helper string and compiles every pattern. The byte escapes in
// the two *_parens_ literals are the UTF-8 encodings of the full-width
// brackets U+FF08/U+FF3B (opening) and U+FF09/U+FF3D (closing).
269 PhoneNumberMatcherRegExps()
270 : opening_parens_("(\\[\xEF\xBC\x88\xEF\xBC\xBB" /* "(\\[（［" */),
271 closing_parens_(")\\]\xEF\xBC\x89\xEF\xBC\xBD" /* ")\\]）］" */),
272 non_parens_(StrCat("[^", opening_parens_, closing_parens_, "]")),
273 bracket_pair_limit_(Limit(0, 3)),
274 leading_maybe_matched_bracket_(StrCat(
275 "(?:[", opening_parens_, "])?",
276 "(?:", non_parens_, "+[", closing_parens_, "])?")),
277 bracket_pairs_(StrCat(
278 "(?:[", opening_parens_, "]", non_parens_, "+",
279 "[", closing_parens_, "])", bracket_pair_limit_)),
280 lead_limit_(Limit(0, 2)),
281 punctuation_limit_(Limit(0, 4)),
282 digit_block_limit_(PhoneNumberUtil::kMaxLengthForNsn +
283 PhoneNumberUtil::kMaxLengthCountryCode),
284 block_limit_(Limit(0, digit_block_limit_)),
285 punctuation_(StrCat("[", PhoneNumberUtil::kValidPunctuation, "]",
286 punctuation_limit_)),
287 digit_sequence_(StrCat("\\p{Nd}", Limit(1, digit_block_limit_))),
288 lead_class_chars_(StrCat(opening_parens_, PhoneNumberUtil::kPlusChars)),
289 lead_class_(StrCat("[", lead_class_chars_, "]")),
290 opening_punctuation_(StrCat("(?:", lead_class_, punctuation_, ")")),
291 optional_extn_pattern_(StrCat(
293 PhoneNumberUtil::GetInstance()->GetExtnPatternsForMatching(),
295 regexp_factory_for_pattern_(new ICURegExpFactory()),
296 #ifdef I18N_PHONENUMBERS_USE_RE2
297 regexp_factory_(new RE2RegExpFactory()),
299 regexp_factory_(new ICURegExpFactory()),
300 #endif // I18N_PHONENUMBERS_USE_RE2
301 pub_pages_(regexp_factory_->CreateRegExp(
302 "\\d{1,5}-+\\d{1,5}\\s{0,4}\\(\\d{1,4}")),
303 slash_separated_dates_(regexp_factory_->CreateRegExp(
304 "(?:(?:[0-3]?\\d/[01]?\\d)|"
305 "(?:[01]?\\d/[0-3]?\\d))/(?:[12]\\d)?\\d{2}")),
306 time_stamps_(regexp_factory_->CreateRegExp(
307 "[12]\\d{3}[-/]?[01]\\d[-/]?[0-3]\\d +[0-2]\\d$")),
308 time_stamps_suffix_(regexp_factory_->CreateRegExp(":[0-5]\\d")),
309 matching_brackets_(regexp_factory_->CreateRegExp(
310 StrCat(leading_maybe_matched_bracket_, non_parens_, "+",
311 bracket_pairs_, non_parens_, "*"))),
312 inner_matches_(new vector<const RegExp*>()),
313 capture_up_to_second_number_start_pattern_(
314 regexp_factory_->CreateRegExp(
315 PhoneNumberUtil::kCaptureUpToSecondNumberStart)),
316 capturing_ascii_digits_pattern_(
317 regexp_factory_->CreateRegExp("(\\d+)")),
318 lead_class_pattern_(regexp_factory_->CreateRegExp(lead_class_)),
// The main pattern: optional leading punctuation, a digit block, then further
// punctuation-separated digit blocks and an optional extension.
319 pattern_(regexp_factory_for_pattern_->CreateRegExp(
320 StrCat("(", opening_punctuation_, lead_limit_,
321 digit_sequence_, "(?:", punctuation_, digit_sequence_, ")",
322 block_limit_, optional_extn_pattern_, ")"))) {
// Inner-match patterns are appended in the order in which
// ExtractInnerMatch() tries them.
323 inner_matches_->push_back(
324 // Breaks on the slash - e.g. "651-234-2345/332-445-1234"
325 regexp_factory_->CreateRegExp("/+(.*)"));
326 inner_matches_->push_back(
327 // Note that the bracket here is inside the capturing group, since we
328 // consider it part of the phone number. Will match a pattern like
329 // "(650) 223 3345 (754) 223 3321".
330 regexp_factory_->CreateRegExp("(\\([^(]*)"));
331 inner_matches_->push_back(
332 // Breaks on a hyphen - e.g. "12345 - 332-445-1234 is my number." We
333 // require a space on either side of the hyphen for it to be considered
335 regexp_factory_->CreateRegExp("(?:\\p{Z}-|-\\p{Z})\\p{Z}*(.+)"));
336 inner_matches_->push_back(
337 // Various types of wide hyphens. Note we have decided not to enforce a
338 // space here, since it's possible that it's supposed to be used to
339 // break two numbers without spaces, and we haven't seen many instances
340 // of it used within a number.
341 regexp_factory_->CreateRegExp(
342 "[\xE2\x80\x92-\x2D\xE2\x80\x95\xEF\xBC\x8D]" /* "‒-―-" */
344 inner_matches_->push_back(
345 // Breaks on a full stop - e.g. "12345. 332-445-1234 is my number."
346 regexp_factory_->CreateRegExp("\\.+\\p{Z}*([^.]+)"));
347 inner_matches_->push_back(
348 // Breaks on space - e.g. "3324451234 8002341234"
349 regexp_factory_->CreateRegExp("\\p{Z}+(\\P{Z}+)"));
353 DISALLOW_COPY_AND_ASSIGN(PhoneNumberMatcherRegExps);
// Singleton that loads the alternate-format metadata once and indexes it by
// country calling code.
// NOTE(review): the embedded numbering skips lines inside this class, so some
// declarations and return statements are not visible in this listing.
356 class AlternateFormats : public Singleton<AlternateFormats> {
// Owns the parsed metadata; the map below stores pointers into it.
358 PhoneMetadataCollection format_data_;
// Country calling code -> metadata entry inside format_data_.
360 map<int, const PhoneMetadata*> calling_code_to_alternate_formats_map_;
364 calling_code_to_alternate_formats_map_() {
365 if (!LoadAlternateFormats(&format_data_)) {
366 LOG(DFATAL) << "Could not parse compiled-in metadata.";
// Index every loaded metadata entry by its country calling code.
369 for (RepeatedPtrField<PhoneMetadata>::const_iterator it =
370 format_data_.metadata().begin();
371 it != format_data_.metadata().end();
373 calling_code_to_alternate_formats_map_.insert(
374 std::make_pair(it->country_code(), &*it));
// Looks up the alternate formats registered for |country_calling_code|.
378 const PhoneMetadata* GetAlternateFormatsForCountry(int country_calling_code)
380 map<int, const PhoneMetadata*>::const_iterator it =
381 calling_code_to_alternate_formats_map_.find(country_calling_code);
382 if (it != calling_code_to_alternate_formats_map_.end()) {
389 DISALLOW_COPY_AND_ASSIGN(AlternateFormats);
// Constructs a matcher bound to the shared regular-expression and
// alternate-format singletons. |region_code| is stored as the preferred region
// passed to the parser, and |max_tries| is stored in max_tries_, which Find()
// checks before each pattern search.
// NOTE(review): several parameters and member initializers of this
// constructor are not visible in this listing.
392 PhoneNumberMatcher::PhoneNumberMatcher(const PhoneNumberUtil& util,
394 const string& region_code,
395 PhoneNumberMatcher::Leniency leniency,
397 : reg_exps_(PhoneNumberMatcherRegExps::GetInstance()),
398 alternate_formats_(AlternateFormats::GetInstance()),
401 preferred_region_(region_code),
403 max_tries_(max_tries),
// Convenience constructor: uses the global PhoneNumberUtil instance and the
// largest possible int for max_tries_.
// NOTE(review): alternate_formats_ is NULL here, yet
// CheckNumberGroupingIsValid() dereferences it; this is safe only if the
// leniency selected by this constructor (not visible in this listing) never
// reaches the grouping checks - confirm.
409 PhoneNumberMatcher::PhoneNumberMatcher(const string& text,
410 const string& region_code)
411 : reg_exps_(PhoneNumberMatcherRegExps::GetInstance()),
412 alternate_formats_(NULL), // Not used.
413 phone_util_(*PhoneNumberUtil::GetInstance()),
415 preferred_region_(region_code),
417 max_tries_(numeric_limits<int>::max()),
// Destructor. NOTE(review): its body is not visible in this listing.
423 PhoneNumberMatcher::~PhoneNumberMatcher() {
// Decides whether |letter| counts as a Latin letter: it must be alphabetic or
// a non-spacing mark according to ICU, and belong to one of the Latin Unicode
// blocks (or the combining diacritical marks block) listed below.
427 bool PhoneNumberMatcher::IsLatinLetter(char32 letter) {
428 // Combining marks are a subset of non-spacing-mark.
429 if (!u_isalpha(letter) && (u_charType(letter) != U_NON_SPACING_MARK)) {
432 UBlockCode block = ublock_getCode(letter);
433 return ((block == UBLOCK_BASIC_LATIN) ||
434 (block == UBLOCK_LATIN_1_SUPPLEMENT) ||
435 (block == UBLOCK_LATIN_EXTENDED_A) ||
436 (block == UBLOCK_LATIN_EXTENDED_ADDITIONAL) ||
437 (block == UBLOCK_LATIN_EXTENDED_B) ||
438 (block == UBLOCK_COMBINING_DIACRITICAL_MARKS));
// Parses |candidate| (found at byte |offset| in text_) and, if it passes the
// checks required by leniency_, stores the result in |match|.
//
// Visible steps:
//  1. reject candidates with unbalanced brackets or that look like
//     publication page ranges;
//  2. for leniency VALID or stricter, reject candidates directly preceded or
//     followed by a Latin letter or an invalid punctuation symbol;
//  3. parse with ParseAndKeepRawInput() using preferred_region_;
//  4. special-case four-digit Israeli numbers, which need a leading '*';
//  5. run VerifyAccordingToLeniency() and fill in |match| on success.
//
// Review fix applied to the visible lines: the output argument of
// GetRegionCodeForCountryCode() had been mis-decoded ("&reg" turned into the
// registered-trademark sign); it is |&region_code| again.
// NOTE(review): the embedded numbering skips lines inside this function, so
// some declarations, return statements and closing braces are not visible in
// this listing.
441 bool PhoneNumberMatcher::ParseAndVerify(const string& candidate, int offset,
442 PhoneNumberMatch* match) {
444 // Check the candidate doesn't contain any formatting which would indicate
445 // that it really isn't a phone number.
446 if (!reg_exps_->matching_brackets_->FullMatch(candidate) ||
447 reg_exps_->pub_pages_->PartialMatch(candidate)) {
451 // If leniency is set to VALID or stricter, we also want to skip numbers that
452 // are surrounded by Latin alphabetic characters, to skip cases like
453 // abc8005001234 or 8005001234def.
454 if (leniency_ >= VALID) {
455 // If the candidate is not at the start of the text, and does not start with
456 // phone-number punctuation, check the previous character.
457 scoped_ptr<RegExpInput> candidate_input(
458 reg_exps_->regexp_factory_->CreateInput(candidate));
460 !reg_exps_->lead_class_pattern_->Consume(candidate_input.get())) {
461 char32 previous_char;
// Step back one whole UTF-8 character (not one byte) from the candidate start.
462 const char* previous_char_ptr =
463 EncodingUtils::BackUpOneUTF8Character(text_.c_str(),
464 text_.c_str() + offset);
465 EncodingUtils::DecodeUTF8Char(previous_char_ptr, &previous_char);
466 // We return false if it is a latin letter or an invalid punctuation
468 if (IsInvalidPunctuationSymbol(previous_char) ||
469 IsLatinLetter(previous_char)) {
// Same check for the character that follows the candidate, if there is one.
473 size_t lastCharIndex = offset + candidate.length();
474 if (lastCharIndex < text_.length()) {
476 const char* next_char_ptr =
477 EncodingUtils::AdvanceOneUTF8Character(
478 text_.c_str() + lastCharIndex - 1);
479 EncodingUtils::DecodeUTF8Char(next_char_ptr, &next_char);
480 if (IsInvalidPunctuationSymbol(next_char) || IsLatinLetter(next_char)) {
487 if (phone_util_.ParseAndKeepRawInput(candidate, preferred_region_, &number) !=
488 PhoneNumberUtil::NO_PARSING_ERROR) {
493 // Check Israel * numbers: these are a special case in that they are
494 // four-digit numbers that our library supports, but they can only be dialled
495 // with a leading *. Since we don't actually store or detect the * in our
496 // phone number library, this means in practice we detect most four digit
497 // numbers as being valid for Israel. We are considering moving these numbers
498 // to ShortNumberInfo instead, in which case this problem would go away, but
499 // in the meantime we want to restrict the false matches so we only allow
500 // these numbers if they are preceded by a star. We enforce this for all
501 // leniency levels even though these numbers are technically accepted by
502 // isPossibleNumber and isValidNumber since we consider it to be a deficiency
503 // in those methods that they accept these numbers without the *.
504 // TODO: Remove this or make it significantly less hacky once
505 // we've decided how to handle these short codes going forward in
506 // ShortNumberInfo. We could use the formatting rules for instance, but that
509 phone_util_.GetRegionCodeForCountryCode(number.country_code(), &region_code);
510 if (region_code == "IL") {
511 string national_number;
512 phone_util_.GetNationalSignificantNumber(number, &national_number);
513 if (national_number.length() == 4 &&
514 // Just check the previous char, since * is an ASCII character.
515 (offset == 0 || (offset > 0 && text_[offset - 1] != '*'))) {
521 if (VerifyAccordingToLeniency(leniency_, number, candidate)) {
522 match->set_start(offset);
523 match->set_raw_string(candidate);
524 // We used ParseAndKeepRawInput to create this number, but for now we don't
525 // return the extra values parsed. TODO: stop clearing all values here and
526 // switch all users over to using raw_input() rather than the raw_string()
527 // of PhoneNumberMatch.
528 number.clear_country_code_source();
529 number.clear_preferred_domestic_carrier_code();
530 number.clear_raw_input();
531 match->set_number(number);
537 // Helper method to replace the verification method for each enum in the Java
// version. Dispatches on |leniency|:
//  POSSIBLE        - IsPossibleNumber() only.
//  VALID           - valid number, valid 'x' characters, and the national
//                    prefix present if required.
//  STRICT_GROUPING - the VALID checks, at most one slash in the national
//                    number, and AllNumberGroupsRemainGrouped.
//  EXACT_GROUPING  - as STRICT_GROUPING but using
//                    AllNumberGroupsAreExactlyPresent.
// Any other value is logged as an error.
// NOTE(review): the callbacks are created with NewPermanentCallback(); their
// release and the return statements are not visible in this listing.
539 bool PhoneNumberMatcher::VerifyAccordingToLeniency(
540 Leniency leniency, const PhoneNumber& number,
541 const string& candidate) const {
543 case PhoneNumberMatcher::POSSIBLE:
544 return phone_util_.IsPossibleNumber(number);
545 case PhoneNumberMatcher::VALID:
546 if (!phone_util_.IsValidNumber(number) ||
547 !ContainsOnlyValidXChars(number, candidate, phone_util_)) {
550 return IsNationalPrefixPresentIfRequired(number);
551 case PhoneNumberMatcher::STRICT_GROUPING: {
552 if (!phone_util_.IsValidNumber(number) ||
553 !ContainsOnlyValidXChars(number, candidate, phone_util_) ||
554 ContainsMoreThanOneSlashInNationalNumber(
555 number, candidate, phone_util_) ||
556 !IsNationalPrefixPresentIfRequired(number)) {
// Free-function checker: digit groups must stay together in the candidate.
559 ResultCallback4<bool, const PhoneNumberUtil&, const PhoneNumber&,
560 const string&, const vector<string>&>* callback =
561 NewPermanentCallback(&AllNumberGroupsRemainGrouped);
562 bool is_valid = CheckNumberGroupingIsValid(number, candidate, callback);
566 case PhoneNumberMatcher::EXACT_GROUPING: {
567 if (!phone_util_.IsValidNumber(number) ||
568 !ContainsOnlyValidXChars(number, candidate, phone_util_) ||
569 ContainsMoreThanOneSlashInNationalNumber(
570 number, candidate, phone_util_) ||
571 !IsNationalPrefixPresentIfRequired(number)) {
// Member-function checker: candidate groups must equal the formatted groups.
574 ResultCallback4<bool, const PhoneNumberUtil&, const PhoneNumber&,
575 const string&, const vector<string>&>* callback =
576 NewPermanentCallback(
577 this, &PhoneNumberMatcher::AllNumberGroupsAreExactlyPresent);
578 bool is_valid = CheckNumberGroupingIsValid(number, candidate, callback);
583 LOG(ERROR) << "No implementation defined for verification for leniency "
584 << static_cast<int>(leniency);
// Tries to find a phone number inside |candidate| by splitting it with each
// of the inner-match patterns in turn. For every pattern the text before the
// first match is tried first, then each captured group; end characters are
// trimmed before each attempt and offsets are reported relative to text_.
// NOTE(review): the embedded numbering skips lines inside this function, so
// the handling of |success|, part of the loop condition and the return
// statements are not visible in this listing.
589 bool PhoneNumberMatcher::ExtractInnerMatch(const string& candidate, int offset,
590 PhoneNumberMatch* match) {
592 for (vector<const RegExp*>::const_iterator regex =
593 reg_exps_->inner_matches_->begin();
594 regex != reg_exps_->inner_matches_->end(); regex++) {
// A fresh input is created for each pattern so that scanning restarts at the
// beginning of the candidate.
595 scoped_ptr<RegExpInput> candidate_input(
596 reg_exps_->regexp_factory_->CreateInput(candidate));
597 bool is_first_match = true;
599 while ((*regex)->FindAndConsume(candidate_input.get(), &group) &&
// Start of the group = candidate length - unconsumed remainder - group length.
601 int group_start_index = candidate.length() -
602 candidate_input->ToString().length() - group.length();
603 if (is_first_match) {
604 // We should handle any group before this one too.
605 string first_group_only = candidate.substr(0, group_start_index);
606 phone_util_.TrimUnwantedEndChars(&first_group_only);
607 bool success = ParseAndVerify(first_group_only, offset, match);
612 is_first_match = false;
614 phone_util_.TrimUnwantedEndChars(&group);
615 bool success = ParseAndVerify(group, offset + group_start_index, match);
// Attempts to turn |candidate| (found at |offset| in text_) into a match.
// Candidates that look like slash-separated dates or time stamps are filtered
// first; then the whole candidate is tried, and finally its inner parts.
// NOTE(review): the return statements of the early-exit branches are not
// visible in this listing.
625 bool PhoneNumberMatcher::ExtractMatch(const string& candidate, int offset,
626 PhoneNumberMatch* match) {
628 // Skip a match that is more likely to be a date.
629 if (reg_exps_->slash_separated_dates_->PartialMatch(candidate)) {
633 // Skip potential time-stamps.
634 if (reg_exps_->time_stamps_->PartialMatch(candidate)) {
// The candidate only covers up to the hours; check whether the text that
// follows it starts with the ":mm" suffix.
635 scoped_ptr<RegExpInput> following_text(
636 reg_exps_->regexp_factory_->CreateInput(
637 text_.substr(offset + candidate.size())));
638 if (reg_exps_->time_stamps_suffix_->Consume(following_text.get())) {
643 // Try to come up with a valid match given the entire candidate.
644 if (ParseAndVerify(candidate, offset, match)) {
648 // If that failed, try to find an "inner match" - there might be a phone
649 // number within this candidate.
650 return ExtractInnerMatch(candidate, offset, match);
// Reports whether another match is available. When the state is NOT_READY it
// searches from search_index_, caches a copy of the match in last_match_ and
// moves search_index_ to the end of that match.
// NOTE(review): the state_ assignments are not visible in this listing.
653 bool PhoneNumberMatcher::HasNext() {
654 if (state_ == NOT_READY) {
655 PhoneNumberMatch temp_match;
656 if (!Find(search_index_, &temp_match)) {
659 last_match_.reset(new PhoneNumberMatch(temp_match.start(),
660 temp_match.raw_string(),
661 temp_match.number()));
662 search_index_ = last_match_->end();
666 return state_ == READY;
// Copies the cached match into |match| and clears the cache so that the next
// call has to search again.
// NOTE(review): the state check and return statements are not visible in this
// listing.
669 bool PhoneNumberMatcher::Next(PhoneNumberMatch* match) {
671 // Check the state and find the next match as a side-effect if necessary.
675 match->CopyFrom(*last_match_);
677 last_match_.reset(NULL);
// Searches text_ from byte |index| for the next phone-number-like candidate
// and tries to extract a match from it, for as long as max_tries_ is positive
// and the main pattern keeps finding candidates.
// NOTE(review): the update of max_tries_ and the return statements are not
// visible in this listing.
681 bool PhoneNumberMatcher::Find(int index, PhoneNumberMatch* match) {
684 scoped_ptr<RegExpInput> text(
685 reg_exps_->regexp_factory_for_pattern_->CreateInput(text_.substr(index)));
687 while ((max_tries_ > 0) &&
688 reg_exps_->pattern_->FindAndConsume(text.get(), &candidate)) {
// Start of the candidate in text_ = total length - unconsumed remainder -
// candidate length.
689 int start = text_.length() - text->ToString().length() - candidate.length();
690 // Check for extra numbers at the end.
691 reg_exps_->capture_up_to_second_number_start_pattern_->
692 PartialMatch(candidate, &candidate);
693 if (ExtractMatch(candidate, start, match)) {
697 index = start + candidate.length();
// Runs |checker| against the digit groups produced by the default formatting
// pattern for |phone_number| and, if that fails, against the groups produced
// by each alternate format registered for the number's country calling code.
// |candidate| has its decimal digits normalized before being handed to
// |checker|.
// NOTE(review): alternate_formats_ is dereferenced without a NULL check here,
// while one constructor initializes it to NULL - confirm that path cannot be
// reached. The return statements are not visible in this listing.
703 bool PhoneNumberMatcher::CheckNumberGroupingIsValid(
704 const PhoneNumber& phone_number,
705 const string& candidate,
706 ResultCallback4<bool, const PhoneNumberUtil&, const PhoneNumber&,
707 const string&, const vector<string>&>* checker) const {
709 // TODO: Evaluate how this works for other locales (testing has been limited
710 // to NANPA regions) and optimise if necessary.
711 string normalized_candidate =
712 NormalizeUTF8::NormalizeDecimalDigits(candidate);
713 vector<string> formatted_number_groups;
714 GetNationalNumberGroups(phone_number, NULL, // Use default formatting pattern
715 &formatted_number_groups);
716 if (checker->Run(phone_util_, phone_number, normalized_candidate,
717 formatted_number_groups)) {
720 // If this didn't pass, see if there are any alternate formats, and try them
722 const PhoneMetadata* alternate_formats =
723 alternate_formats_->GetAlternateFormatsForCountry(
724 phone_number.country_code());
725 if (alternate_formats) {
726 for (RepeatedPtrField<NumberFormat>::const_iterator it =
727 alternate_formats->number_format().begin();
728 it != alternate_formats->number_format().end(); ++it) {
// Rebuild the groups using this alternate format and run the checker again.
729 formatted_number_groups.clear();
730 GetNationalNumberGroups(phone_number, &*it, &formatted_number_groups);
731 if (checker->Run(phone_util_, phone_number, normalized_candidate,
732 formatted_number_groups)) {
740 // Helper method to get the national-number part of a number, formatted without
741 // any national prefix, and return it as a set of digit blocks that would be
742 // formatted together.
// When |formatting_pattern| is NULL the number is formatted as RFC3966 and the
// text between the first '-' and the ';' that starts the extension (or the end
// of the string) is split; otherwise only the national significant number is
// formatted with the given pattern and split on '-'. The blocks are written
// to |digit_blocks|.
// NOTE(review): some call arguments and closing braces are not visible in
// this listing.
743 void PhoneNumberMatcher::GetNationalNumberGroups(
744 const PhoneNumber& number,
745 const NumberFormat* formatting_pattern,
746 vector<string>* digit_blocks) const {
747 string rfc3966_format;
748 if (!formatting_pattern) {
749 // This will be in the format +CC-DG;ext=EXT where DG represents groups of
751 phone_util_.Format(number, PhoneNumberUtil::RFC3966, &rfc3966_format);
752 // We remove the extension part from the formatted string before splitting
753 // it into different groups.
754 size_t end_index = rfc3966_format.find(';');
755 if (end_index == string::npos) {
756 end_index = rfc3966_format.length();
758 // The country-code will have a '-' following it.
759 size_t start_index = rfc3966_format.find('-') + 1;
760 SplitStringUsing(rfc3966_format.substr(start_index,
761 end_index - start_index),
764 // We format the NSN only, and split that according to the separator.
765 string national_significant_number;
766 phone_util_.GetNationalSignificantNumber(number,
767 &national_significant_number);
768 phone_util_.FormatNsnUsingPattern(national_significant_number,
770 PhoneNumberUtil::RFC3966,
772 SplitStringUsing(rfc3966_format, "-", digit_blocks);
// Checks whether |number| was written with a national prefix when the
// formatting rules for its region require one. Numbers whose country code did
// not come from the default country are handled by the first check. Otherwise
// the formatting rule chosen for the national number decides whether a prefix
// is needed, and if so the normalized raw input is tested with
// MaybeStripNationalPrefixAndCarrierCode().
// NOTE(review): the return statements of the early-exit branches and some
// call arguments are not visible in this listing.
776 bool PhoneNumberMatcher::IsNationalPrefixPresentIfRequired(
777 const PhoneNumber& number) const {
778 // First, check how we deduced the country code. If it was written in
779 // international format, then the national prefix is not required.
780 if (number.country_code_source() != PhoneNumber::FROM_DEFAULT_COUNTRY) {
783 string phone_number_region;
784 phone_util_.GetRegionCodeForCountryCode(
785 number.country_code(), &phone_number_region);
786 const PhoneMetadata* metadata =
787 phone_util_.GetMetadataForRegion(phone_number_region);
791 // Check if a national prefix should be present when formatting this number.
792 string national_number;
793 phone_util_.GetNationalSignificantNumber(number, &national_number);
794 const NumberFormat* format_rule =
795 phone_util_.ChooseFormattingPatternForNumber(metadata->number_format(),
797 // To do this, we check that a national prefix formatting rule was present and
798 // that it wasn't just the first-group symbol ($1) with punctuation.
799 if (format_rule && !format_rule->national_prefix_formatting_rule().empty()) {
800 if (format_rule->national_prefix_optional_when_formatting()) {
801 // The national-prefix is optional in these cases, so we don't need to
802 // check if it was present.
805 if (phone_util_.FormattingRuleHasFirstGroupOnly(
806 format_rule->national_prefix_formatting_rule())) {
807 // National Prefix not needed for this number.
810 // Normalize the remainder.
811 string raw_input_copy(number.raw_input());
812 // Check if we found a national prefix and/or carrier code at the start of
813 // the raw input, and return the result.
814 phone_util_.NormalizeDigitsOnly(&raw_input_copy);
815 return phone_util_.MaybeStripNationalPrefixAndCarrierCode(
818 NULL); // Don't need to keep the stripped carrier code.
// Checks that the runs of ASCII digits in |normalized_candidate| equal
// |formatted_number_groups|. Groups are compared from the last one backwards;
// the first formatted group only has to be a suffix of its candidate group.
// A candidate made of a single digit group, or one whose last relevant group
// contains the whole national significant number, is handled by the early
// check below.
// NOTE(review): the return statements of the early-exit branches are not
// visible in this listing.
823 bool PhoneNumberMatcher::AllNumberGroupsAreExactlyPresent(
824 const PhoneNumberUtil& util,
825 const PhoneNumber& phone_number,
826 const string& normalized_candidate,
827 const vector<string>& formatted_number_groups) const {
828 const scoped_ptr<RegExpInput> candidate_number(
829 reg_exps_->regexp_factory_->CreateInput(normalized_candidate));
830 vector<string> candidate_groups;
// Collect every run of ASCII digits in the candidate.
832 while (reg_exps_->capturing_ascii_digits_pattern_->FindAndConsume(
833 candidate_number.get(),
835 candidate_groups.push_back(digit_block);
838 // Set this to the last group, skipping it if the number has an extension.
// NOTE(review): size() is unsigned, so with fewer groups than expected the
// subtraction wraps before the conversion to int; an empty candidate_groups
// would make the .at() call below throw - confirm callers guarantee at least
// one digit group.
839 int candidate_number_group_index =
840 phone_number.has_extension() ? candidate_groups.size() - 2
841 : candidate_groups.size() - 1;
842 // First we check if the national significant number is formatted as a block.
843 // We use find and not equals, since the national significant number may be
844 // present with a prefix such as a national number prefix, or the country code
846 string national_significant_number;
847 util.GetNationalSignificantNumber(phone_number,
848 &national_significant_number);
849 if (candidate_groups.size() == 1 ||
850 candidate_groups.at(candidate_number_group_index).find(
851 national_significant_number) != string::npos) {
854 // Starting from the end, go through in reverse, excluding the first group,
855 // and check the candidate and number groups are the same.
856 for (int formatted_number_group_index =
857 (formatted_number_groups.size() - 1);
858 formatted_number_group_index > 0 &&
859 candidate_number_group_index >= 0;
860 --formatted_number_group_index, --candidate_number_group_index) {
861 if (candidate_groups.at(candidate_number_group_index) !=
862 formatted_number_groups.at(formatted_number_group_index)) {
866 // Now check the first group. There may be a national prefix at the start, so
867 // we only check that the candidate group ends with the formatted number
869 return (candidate_number_group_index >= 0 &&
870 HasSuffixString(candidate_groups.at(candidate_number_group_index),
871 formatted_number_groups.at(0)));
// Counts '/' characters in |candidate|. Zero or one slash is acceptable. With
// two or more, the visible code makes one allowance: when the country code was
// taken from the number itself and the digits before the first slash equal
// that country code, the result depends on whether a third slash exists.
// NOTE(review): the return statements of the other branches are not visible
// in this listing.
875 bool PhoneNumberMatcher::ContainsMoreThanOneSlashInNationalNumber(
876 const PhoneNumber& number,
877 const string& candidate,
878 const PhoneNumberUtil& util) {
879 size_t first_slash_in_body = candidate.find('/');
880 if (first_slash_in_body == string::npos) {
881 // No slashes, this is okay.
884 // Now look for a second one.
885 size_t second_slash_in_body = candidate.find('/', first_slash_in_body + 1);
886 if (second_slash_in_body == string::npos) {
887 // Only one slash, this is okay.
891 // If the first slash is after the country calling code, this is permitted.
892 if (number.country_code_source() == PhoneNumber::FROM_NUMBER_WITH_PLUS_SIGN ||
893 number.country_code_source() ==
894 PhoneNumber::FROM_NUMBER_WITHOUT_PLUS_SIGN) {
// Keep only the digits before the first slash and compare them with the
// parsed country code.
895 string normalized_country_code =
896 candidate.substr(0, first_slash_in_body);
897 util.NormalizeDigitsOnly(&normalized_country_code);
898 if (normalized_country_code == SimpleItoa(number.country_code())) {
899 // Any more slashes and this is illegal.
900 return candidate.find('/', second_slash_in_body + 1) != string::npos;
906 } // namespace phonenumbers