src/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc
// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"

#include <list>
#include <map>

#include "base/bind.h"
#include "base/compiler_specific.h"
#include "base/i18n/case_conversion.h"
#include "base/logging.h"
#include "base/message_loop/message_loop.h"
#include "base/metrics/histogram.h"
#include "base/strings/utf_string_conversions.h"
#include "base/time/time.h"
#include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
#include "chrome/renderer/safe_browsing/features.h"
#include "chrome/renderer/safe_browsing/murmurhash3_util.h"
#include "crypto/sha2.h"
#include "third_party/icu/source/common/unicode/ubrk.h"
#include "ui/base/l10n/l10n_util.h"

namespace safe_browsing {

// This time should be short enough that it doesn't noticeably disrupt the
// user's interaction with the page.
const int PhishingTermFeatureExtractor::kMaxTimePerChunkMs = 10;

// Experimenting shows that we get a reasonable gain in performance by
// increasing this up to around 10, but there's not much benefit in
// increasing it past that.
const int PhishingTermFeatureExtractor::kClockCheckGranularity = 5;

// This should be longer than we expect feature extraction to take on any
// actual phishing page.
const int PhishingTermFeatureExtractor::kMaxTotalTimeMs = 500;

// The maximum size of the negative word cache.
const int PhishingTermFeatureExtractor::kMaxNegativeWordCacheSize = 1000;

// All of the state pertaining to the current feature extraction.
struct PhishingTermFeatureExtractor::ExtractionState {
  // Stores up to max_words_per_term_ previous words separated by spaces.
  std::string previous_words;

  // Stores the sizes of the words in previous_words.  Note: the size includes
  // the space after each word.  In other words, the sum of all sizes in this
  // list is equal to the length of previous_words.
  std::list<size_t> previous_word_sizes;

  // An iterator for word breaking.
  UBreakIterator* iterator;

  // Our current position in the text that was passed to the ExtractionState
  // constructor, specifically, the most recent break position returned by our
  // iterator.
  int position;

  // True if position has been initialized.
  bool position_initialized;

  // The time at which we started feature extraction for the current page.
  base::TimeTicks start_time;

  // The number of iterations we've done for the current extraction.
  int num_iterations;

  ExtractionState(const string16& text, base::TimeTicks start_time_ticks)
      : position(-1),
        position_initialized(false),
        start_time(start_time_ticks),
        num_iterations(0) {
    UErrorCode status = U_ZERO_ERROR;
    // TODO(bryner): We should pass in the language for the document.
    iterator = ubrk_open(UBRK_WORD, NULL,
                         text.data(), text.size(),
                         &status);
    if (U_FAILURE(status)) {
      DLOG(ERROR) << "ubrk_open failed: " << status;
      iterator = NULL;
    }
  }

  ~ExtractionState() {
    if (iterator) {
      ubrk_close(iterator);
    }
  }
};

PhishingTermFeatureExtractor::PhishingTermFeatureExtractor(
    const base::hash_set<std::string>* page_term_hashes,
    const base::hash_set<uint32>* page_word_hashes,
    size_t max_words_per_term,
    uint32 murmurhash3_seed,
    FeatureExtractorClock* clock)
    : page_term_hashes_(page_term_hashes),
      page_word_hashes_(page_word_hashes),
      max_words_per_term_(max_words_per_term),
      murmurhash3_seed_(murmurhash3_seed),
      negative_word_cache_(kMaxNegativeWordCacheSize),
      clock_(clock),
      weak_factory_(this) {
  Clear();
}

PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() {
  // The RenderView should have called CancelPendingExtraction() before
  // we are destroyed.
  CheckNoPendingExtraction();
}

void PhishingTermFeatureExtractor::ExtractFeatures(
    const string16* page_text,
    FeatureMap* features,
    const DoneCallback& done_callback) {
  // The RenderView should have called CancelPendingExtraction() before
  // starting a new extraction, so DCHECK this.
  CheckNoPendingExtraction();
  // However, in an opt build, we will go ahead and clean up the pending
  // extraction so that we can start in a known state.
  CancelPendingExtraction();

  page_text_ = page_text;
  features_ = features;
  done_callback_ = done_callback;

  state_.reset(new ExtractionState(*page_text_, clock_->Now()));
  base::MessageLoop::current()->PostTask(
      FROM_HERE,
      base::Bind(&PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout,
                 weak_factory_.GetWeakPtr()));
}

void PhishingTermFeatureExtractor::CancelPendingExtraction() {
  // Cancel any pending callbacks, and clear our state.
  weak_factory_.InvalidateWeakPtrs();
  Clear();
}

void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() {
  DCHECK(state_.get());
  ++state_->num_iterations;
  base::TimeTicks current_chunk_start_time = clock_->Now();

  if (!state_->iterator) {
    // We failed to initialize the break iterator, so stop now.
    UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureBreakIterError", 1);
    RunCallback(false);
    return;
  }

  if (!state_->position_initialized) {
    state_->position = ubrk_first(state_->iterator);
    if (state_->position == UBRK_DONE) {
      // No words present, so we're done.
      RunCallback(true);
      return;
    }
    state_->position_initialized = true;
  }

  int num_words = 0;
  for (int next = ubrk_next(state_->iterator);
       next != UBRK_DONE; next = ubrk_next(state_->iterator)) {
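    // ubrk_getRuleStatus() reports what kind of boundary was just crossed.
    // UBRK_WORD_NONE means the text between the previous position and |next|
    // is whitespace, punctuation, or other non-word characters, so only real
    // words are handed to HandleWord().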
    if (ubrk_getRuleStatus(state_->iterator) != UBRK_WORD_NONE) {
      // next is now positioned at the end of a word.
      HandleWord(base::StringPiece16(page_text_->data() + state_->position,
                                     next - state_->position));
      ++num_words;
    }
    state_->position = next;

    if (num_words >= kClockCheckGranularity) {
      num_words = 0;
      base::TimeTicks now = clock_->Now();
      if (now - state_->start_time >=
          base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) {
        DLOG(ERROR) << "Feature extraction took too long, giving up";
        // We expect this to happen infrequently, so record when it does.
        UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureTimeout", 1);
        RunCallback(false);
        return;
      }
      base::TimeDelta chunk_elapsed = now - current_chunk_start_time;
      if (chunk_elapsed >=
          base::TimeDelta::FromMilliseconds(kMaxTimePerChunkMs)) {
        // The time limit for the current chunk is up, so post a task to
        // continue extraction.
        //
        // Record how much time we actually spent on the chunk.  If this is
        // much higher than kMaxTimePerChunkMs, we may need to adjust the
        // clock granularity.
        UMA_HISTOGRAM_TIMES("SBClientPhishing.TermFeatureChunkTime",
                            chunk_elapsed);
        base::MessageLoop::current()->PostTask(
            FROM_HERE,
            base::Bind(
                &PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout,
                weak_factory_.GetWeakPtr()));
        return;
      }
      // Otherwise, continue.
    }
  }
  RunCallback(true);
}

void PhishingTermFeatureExtractor::HandleWord(
    const base::StringPiece16& word) {
  // Quickest out if we have seen this word before and know that it's not
  // part of any term. This avoids the lowercasing and UTF conversion, both of
  // which are relatively expensive.
  if (negative_word_cache_.Get(word) != negative_word_cache_.end()) {
    // We know we're no longer in a possible n-gram, so clear the previous word
    // state.
    state_->previous_words.clear();
    state_->previous_word_sizes.clear();
    return;
  }

  std::string word_lower = UTF16ToUTF8(base::i18n::ToLower(word));
  uint32 word_hash = MurmurHash3String(word_lower, murmurhash3_seed_);
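  // Note: matching the individual word against page_word_hashes_ with
  // MurmurHash3 is just a cheap pre-filter.  Term features are only added
  // below when a SHA-256 hash matches page_term_hashes_, so a stray word-hash
  // collision costs a little extra work rather than a bogus feature.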

  // Quick out if the word is not part of any term, which is the common case.
  if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) {
    // Word doesn't exist in our terms so we can clear the n-gram state.
    state_->previous_words.clear();
    state_->previous_word_sizes.clear();
    // Insert into negative cache so that we don't try this again.
    negative_word_cache_.Put(word, true);
    return;
  }

  // Find all of the n-grams that we need to check and compute their SHA-256
  // hashes.
  std::map<std::string /* hash */, std::string /* plaintext */>
      hashes_to_check;
  hashes_to_check[crypto::SHA256HashString(word_lower)] = word_lower;

  // Combine the new word with the previous words to find additional n-grams.
  // Note that we don't yet add the new word length to previous_word_sizes,
  // since we don't want to compute the hash for the word by itself again.
  //
  state_->previous_words.append(word_lower);
  std::string current_term = state_->previous_words;
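  // current_term now holds the full window "w_1 w_2 ... w_k <current word>",
  // oldest word first.  Each pass below hashes the window and then drops the
  // leading word, so we check every multi-word term ending in the current
  // word, from longest to shortest.  The single-word term was hashed above.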
  for (std::list<size_t>::iterator it = state_->previous_word_sizes.begin();
       it != state_->previous_word_sizes.end(); ++it) {
    hashes_to_check[crypto::SHA256HashString(current_term)] = current_term;
    current_term.erase(0, *it);
  }

  // Add features for any hashes that match page_term_hashes_.
  for (std::map<std::string, std::string>::iterator it =
           hashes_to_check.begin();
       it != hashes_to_check.end(); ++it) {
    if (page_term_hashes_->find(it->first) != page_term_hashes_->end()) {
      features_->AddBooleanFeature(features::kPageTerm + it->second);
    }
  }

  // Now that we have handled the current word, we have to add a space at the
  // end of it, and add the new word's size (including the space) to
  // previous_word_sizes.  Note: it's possible that the document language
  // doesn't use ASCII spaces to separate words.  That's fine though, we just
  // need to be consistent with how the model is generated.
  state_->previous_words.append(" ");
  state_->previous_word_sizes.push_back(word_lower.size() + 1);

  // Cap the number of previous words.
  if (state_->previous_word_sizes.size() >= max_words_per_term_) {
    state_->previous_words.erase(0, state_->previous_word_sizes.front());
    state_->previous_word_sizes.pop_front();
  }
}

void PhishingTermFeatureExtractor::CheckNoPendingExtraction() {
  DCHECK(done_callback_.is_null());
  DCHECK(!state_.get());
  if (!done_callback_.is_null() || state_.get()) {
    LOG(ERROR) << "Extraction in progress, missing call to "
               << "CancelPendingExtraction";
  }
}

void PhishingTermFeatureExtractor::RunCallback(bool success) {
  // Record some timing stats that we can use to evaluate feature extraction
  // performance.  These include both successful and failed extractions.
  DCHECK(state_.get());
  UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureIterations",
                       state_->num_iterations);
  UMA_HISTOGRAM_TIMES("SBClientPhishing.TermFeatureTotalTime",
                      clock_->Now() - state_->start_time);

  DCHECK(!done_callback_.is_null());
  done_callback_.Run(success);
  Clear();
}

void PhishingTermFeatureExtractor::Clear() {
  page_text_ = NULL;
  features_ = NULL;
  done_callback_.Reset();
  state_.reset(NULL);
  negative_word_cache_.Clear();
}

}  // namespace safe_browsing