Upstream version 5.34.104.0
[platform/framework/web/crosswalk.git] / src / chrome / renderer / safe_browsing / phishing_url_feature_extractor.h
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 //
5 // PhishingUrlFeatureExtractor handles computing URL-based features for
6 // the client-side phishing detection model.  These include tokens in the
7 // host and path, features pertaining to host length, and IP addresses.
8
9 #ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_URL_FEATURE_EXTRACTOR_H_
10 #define CHROME_RENDERER_SAFE_BROWSING_PHISHING_URL_FEATURE_EXTRACTOR_H_
11
12 #include <string>
13 #include <vector>
14
15 #include "base/basictypes.h"
16
17 class GURL;  // Forward declaration; this header only takes GURL by const reference.
18
19 namespace safe_browsing {
20 class FeatureMap;  // Forward declaration; only used through a pointer below.
21 // Computes URL-based features for the client-side phishing detection model.
22 class PhishingUrlFeatureExtractor {
23  public:
24   PhishingUrlFeatureExtractor();
25   ~PhishingUrlFeatureExtractor();
26
27   // Extracts features for |url| and stores them in |features|.  Returns
28   // true on success.  NOTE(review): confirm whether |features| may be NULL.
29   bool ExtractFeatures(const GURL& url, FeatureMap* features);
30
31  private:
32   friend class PhishingUrlFeatureExtractorTest;  // May access private members.
33   // Minimum length of a token kept by SplitStringIntoLongAlphanumTokens().
34   static const size_t kMinPathComponentLength = 3;
35
36   // Given a string |full|, finds all substrings of consecutive alphanumeric
37   // characters of length >= kMinPathComponentLength and inserts them into
38   // |tokens|.  NOTE(review): verify whether |tokens| is cleared beforehand.
39   static void SplitStringIntoLongAlphanumTokens(
40       const std::string& full,
41       std::vector<std::string>* tokens);
42   // Presumably forbids copying and assignment (macro is defined elsewhere).
43   DISALLOW_COPY_AND_ASSIGN(PhishingUrlFeatureExtractor);
44 };
45
46 }  // namespace safe_browsing
47
48 #endif  // CHROME_RENDERER_SAFE_BROWSING_PHISHING_URL_FEATURE_EXTRACTOR_H_