Upstream version 10.39.225.0
[platform/framework/web/crosswalk.git] / src / chrome / browser / devtools / devtools_file_system_indexer.cc
1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "chrome/browser/devtools/devtools_file_system_indexer.h"
6
7 #include <iterator>
8
9 #include "base/bind.h"
10 #include "base/callback.h"
11 #include "base/files/file_enumerator.h"
12 #include "base/files/file_util.h"
13 #include "base/files/file_util_proxy.h"
14 #include "base/lazy_instance.h"
15 #include "base/logging.h"
16 #include "base/stl_util.h"
17 #include "base/strings/utf_string_conversions.h"
18 #include "content/public/browser/browser_thread.h"
19
20 using base::Bind;
21 using base::Callback;
22 using base::FileEnumerator;
23 using base::FilePath;
24 using base::Time;
25 using base::TimeDelta;
26 using base::TimeTicks;
27 using content::BrowserThread;
28 using std::map;
29 using std::set;
30 using std::string;
31 using std::vector;
32
33 namespace {
34
35 typedef int32 Trigram;
36 typedef char TrigramChar;
37 typedef uint16 FileId;
38
39 const int kMinTimeoutBetweenWorkedNitification = 200;
40 // Trigram characters include all ASCII printable characters (32-126) except for
41 // the capital letters, because the index is case insensitive.
42 const size_t kTrigramCharacterCount = 126 - 'Z' - 1 + 'A' - ' ' + 1;
43 const size_t kTrigramCount =
44     kTrigramCharacterCount * kTrigramCharacterCount * kTrigramCharacterCount;
45 const int kMaxReadLength = 10 * 1024;
46 const TrigramChar kUndefinedTrigramChar = -1;
47 const Trigram kUndefinedTrigram = -1;
48
49 base::LazyInstance<vector<bool> >::Leaky g_is_binary_char =
50     LAZY_INSTANCE_INITIALIZER;
51
52 base::LazyInstance<vector<TrigramChar> >::Leaky g_trigram_chars =
53     LAZY_INSTANCE_INITIALIZER;
54
55 class Index {
56  public:
57   Index();
58   Time LastModifiedTimeForFile(const FilePath& file_path);
59   void SetTrigramsForFile(const FilePath& file_path,
60                           const vector<Trigram>& index,
61                           const Time& time);
62   vector<FilePath> Search(string query);
63   void PrintStats();
64   void NormalizeVectors();
65
66  private:
67   ~Index();
68
69   FileId GetFileId(const FilePath& file_path);
70
71   typedef map<FilePath, FileId> FileIdsMap;
72   FileIdsMap file_ids_;
73   FileId last_file_id_;
74   // The index in this vector is the trigram id.
75   vector<vector<FileId> > index_;
76   typedef map<FilePath, Time> IndexedFilesMap;
77   IndexedFilesMap index_times_;
78   vector<bool> is_normalized_;
79
80   DISALLOW_COPY_AND_ASSIGN(Index);
81 };
82
83 base::LazyInstance<Index>::Leaky g_trigram_index = LAZY_INSTANCE_INITIALIZER;
84
85 void InitIsBinaryCharMap() {
86   for (size_t i = 0; i < 256; ++i) {
87     unsigned char ch = i;
88     bool is_binary_char = ch < 9 || (ch >= 14 && ch < 32) || ch == 127;
89     g_is_binary_char.Get().push_back(is_binary_char);
90   }
91 }
92
93 bool IsBinaryChar(char c) {
94   unsigned char uc = static_cast<unsigned char>(c);
95   return g_is_binary_char.Get()[uc];
96 }
97
98 void InitTrigramCharsMap() {
99   for (size_t i = 0; i < 256; ++i) {
100     if (i > 127) {
101       g_trigram_chars.Get().push_back(kUndefinedTrigramChar);
102       continue;
103     }
104     char ch = i;
105     if (ch == '\t')
106       ch = ' ';
107     if (ch >= 'A' && ch <= 'Z')
108       ch = ch - 'A' + 'a';
109     if ((IsBinaryChar(ch)) || (ch < ' ')) {
110       g_trigram_chars.Get().push_back(kUndefinedTrigramChar);
111       continue;
112     }
113
114     if (ch >= 'Z')
115       ch = ch - 'Z' - 1 + 'A';
116     ch -= ' ';
117     char signed_trigram_char_count = static_cast<char>(kTrigramCharacterCount);
118     CHECK(ch >= 0 && ch < signed_trigram_char_count);
119     g_trigram_chars.Get().push_back(ch);
120   }
121 }
122
123 TrigramChar TrigramCharForChar(char c) {
124   unsigned char uc = static_cast<unsigned char>(c);
125   return g_trigram_chars.Get()[uc];
126 }
127
128 Trigram TrigramAtIndex(const vector<TrigramChar>& trigram_chars, size_t index) {
129   static int kTrigramCharacterCountSquared =
130       kTrigramCharacterCount * kTrigramCharacterCount;
131   if (trigram_chars[index] == kUndefinedTrigramChar ||
132       trigram_chars[index + 1] == kUndefinedTrigramChar ||
133       trigram_chars[index + 2] == kUndefinedTrigramChar)
134     return kUndefinedTrigram;
135   Trigram trigram = kTrigramCharacterCountSquared * trigram_chars[index] +
136                     kTrigramCharacterCount * trigram_chars[index + 1] +
137                     trigram_chars[index + 2];
138   return trigram;
139 }
140
141 Index::Index() : last_file_id_(0) {
142   index_.resize(kTrigramCount);
143   is_normalized_.resize(kTrigramCount);
144   std::fill(is_normalized_.begin(), is_normalized_.end(), true);
145 }
146
147 Index::~Index() {}
148
149 Time Index::LastModifiedTimeForFile(const FilePath& file_path) {
150   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
151   Time last_modified_time;
152   if (index_times_.find(file_path) != index_times_.end())
153     last_modified_time = index_times_[file_path];
154   return last_modified_time;
155 }
156
157 void Index::SetTrigramsForFile(const FilePath& file_path,
158                                const vector<Trigram>& index,
159                                const Time& time) {
160   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
161   FileId file_id = GetFileId(file_path);
162   vector<Trigram>::const_iterator it = index.begin();
163   for (; it != index.end(); ++it) {
164     Trigram trigram = *it;
165     index_[trigram].push_back(file_id);
166     is_normalized_[trigram] = false;
167   }
168   index_times_[file_path] = time;
169 }
170
171 vector<FilePath> Index::Search(string query) {
172   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
173   const char* data = query.c_str();
174   vector<TrigramChar> trigram_chars;
175   trigram_chars.reserve(query.size());
176   for (size_t i = 0; i < query.size(); ++i)
177       trigram_chars.push_back(TrigramCharForChar(data[i]));
178   vector<Trigram> trigrams;
179   for (size_t i = 0; i + 2 < query.size(); ++i) {
180     Trigram trigram = TrigramAtIndex(trigram_chars, i);
181     if (trigram != kUndefinedTrigram)
182       trigrams.push_back(trigram);
183   }
184   set<FileId> file_ids;
185   bool first = true;
186   vector<Trigram>::const_iterator it = trigrams.begin();
187   for (; it != trigrams.end(); ++it) {
188     Trigram trigram = *it;
189     if (first) {
190       std::copy(index_[trigram].begin(),
191                 index_[trigram].end(),
192                 std::inserter(file_ids, file_ids.begin()));
193       first = false;
194       continue;
195     }
196     set<FileId> intersection = base::STLSetIntersection<set<FileId> >(
197         file_ids, index_[trigram]);
198     file_ids.swap(intersection);
199   }
200   vector<FilePath> result;
201   FileIdsMap::const_iterator ids_it = file_ids_.begin();
202   for (; ids_it != file_ids_.end(); ++ids_it) {
203     if (trigrams.size() == 0 ||
204         file_ids.find(ids_it->second) != file_ids.end()) {
205       result.push_back(ids_it->first);
206     }
207   }
208   return result;
209 }
210
211 FileId Index::GetFileId(const FilePath& file_path) {
212   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
213   string file_path_str = file_path.AsUTF8Unsafe();
214   if (file_ids_.find(file_path) != file_ids_.end())
215     return file_ids_[file_path];
216   file_ids_[file_path] = ++last_file_id_;
217   return last_file_id_;
218 }
219
220 void Index::NormalizeVectors() {
221   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
222   for (size_t i = 0; i < kTrigramCount; ++i) {
223     if (!is_normalized_[i]) {
224       std::sort(index_[i].begin(), index_[i].end());
225       if (index_[i].capacity() > index_[i].size())
226         vector<FileId>(index_[i]).swap(index_[i]);
227       is_normalized_[i] = true;
228     }
229   }
230 }
231
232 void Index::PrintStats() {
233   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
234   LOG(ERROR) << "Index stats:";
235   size_t size = 0;
236   size_t maxSize = 0;
237   size_t capacity = 0;
238   for (size_t i = 0; i < kTrigramCount; ++i) {
239     if (index_[i].size() > maxSize)
240       maxSize = index_[i].size();
241     size += index_[i].size();
242     capacity += index_[i].capacity();
243   }
244   LOG(ERROR) << "  - total trigram count: " << size;
245   LOG(ERROR) << "  - max file count per trigram: " << maxSize;
246   LOG(ERROR) << "  - total vectors capacity " << capacity;
247   size_t total_index_size =
248       capacity * sizeof(FileId) + sizeof(vector<FileId>) * kTrigramCount;
249   LOG(ERROR) << "  - estimated total index size " << total_index_size;
250 }
251
252 typedef Callback<void(bool, const vector<bool>&)> IndexerCallback;
253
254 }  // namespace
255
256 DevToolsFileSystemIndexer::FileSystemIndexingJob::FileSystemIndexingJob(
257     const FilePath& file_system_path,
258     const TotalWorkCallback& total_work_callback,
259     const WorkedCallback& worked_callback,
260     const DoneCallback& done_callback)
261     : file_system_path_(file_system_path),
262       total_work_callback_(total_work_callback),
263       worked_callback_(worked_callback),
264       done_callback_(done_callback),
265       current_file_(BrowserThread::GetMessageLoopProxyForThread(
266                         BrowserThread::FILE).get()),
267       files_indexed_(0),
268       stopped_(false) {
269   current_trigrams_set_.resize(kTrigramCount);
270   current_trigrams_.reserve(kTrigramCount);
271 }
272
273 DevToolsFileSystemIndexer::FileSystemIndexingJob::~FileSystemIndexingJob() {}
274
275 void DevToolsFileSystemIndexer::FileSystemIndexingJob::Start() {
276   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
277   BrowserThread::PostTask(
278       BrowserThread::FILE,
279       FROM_HERE,
280       Bind(&FileSystemIndexingJob::CollectFilesToIndex, this));
281 }
282
283 void DevToolsFileSystemIndexer::FileSystemIndexingJob::Stop() {
284   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
285   BrowserThread::PostTask(BrowserThread::FILE,
286                           FROM_HERE,
287                           Bind(&FileSystemIndexingJob::StopOnFileThread, this));
288 }
289
290 void DevToolsFileSystemIndexer::FileSystemIndexingJob::StopOnFileThread() {
291   stopped_ = true;
292 }
293
294 void DevToolsFileSystemIndexer::FileSystemIndexingJob::CollectFilesToIndex() {
295   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
296   if (stopped_)
297     return;
298   if (!file_enumerator_) {
299     file_enumerator_.reset(
300         new FileEnumerator(file_system_path_, true, FileEnumerator::FILES));
301   }
302   FilePath file_path = file_enumerator_->Next();
303   if (file_path.empty()) {
304     BrowserThread::PostTask(
305         BrowserThread::UI,
306         FROM_HERE,
307         Bind(total_work_callback_, file_path_times_.size()));
308     indexing_it_ = file_path_times_.begin();
309     IndexFiles();
310     return;
311   }
312   Time saved_last_modified_time =
313       g_trigram_index.Get().LastModifiedTimeForFile(file_path);
314   FileEnumerator::FileInfo file_info = file_enumerator_->GetInfo();
315   Time current_last_modified_time = file_info.GetLastModifiedTime();
316   if (current_last_modified_time > saved_last_modified_time) {
317     file_path_times_[file_path] = current_last_modified_time;
318   }
319   BrowserThread::PostTask(
320       BrowserThread::FILE,
321       FROM_HERE,
322       Bind(&FileSystemIndexingJob::CollectFilesToIndex, this));
323 }
324
325 void DevToolsFileSystemIndexer::FileSystemIndexingJob::IndexFiles() {
326   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
327   if (stopped_)
328     return;
329   if (indexing_it_ == file_path_times_.end()) {
330     g_trigram_index.Get().NormalizeVectors();
331     BrowserThread::PostTask(BrowserThread::UI, FROM_HERE, done_callback_);
332     return;
333   }
334   FilePath file_path = indexing_it_->first;
335   current_file_.CreateOrOpen(
336         file_path,
337         base::File::FLAG_OPEN | base::File::FLAG_READ,
338         Bind(&FileSystemIndexingJob::StartFileIndexing, this));
339 }
340
341 void DevToolsFileSystemIndexer::FileSystemIndexingJob::StartFileIndexing(
342     base::File::Error error) {
343   if (!current_file_.IsValid()) {
344     FinishFileIndexing(false);
345     return;
346   }
347   current_file_offset_ = 0;
348   current_trigrams_.clear();
349   std::fill(current_trigrams_set_.begin(), current_trigrams_set_.end(), false);
350   ReadFromFile();
351 }
352
353 void DevToolsFileSystemIndexer::FileSystemIndexingJob::ReadFromFile() {
354   if (stopped_) {
355     CloseFile();
356     return;
357   }
358   current_file_.Read(current_file_offset_, kMaxReadLength,
359                      Bind(&FileSystemIndexingJob::OnRead, this));
360 }
361
362 void DevToolsFileSystemIndexer::FileSystemIndexingJob::OnRead(
363     base::File::Error error,
364     const char* data,
365     int bytes_read) {
366   if (error != base::File::FILE_OK) {
367     FinishFileIndexing(false);
368     return;
369   }
370
371   if (!bytes_read || bytes_read < 3) {
372     FinishFileIndexing(true);
373     return;
374   }
375
376   size_t size = static_cast<size_t>(bytes_read);
377   vector<TrigramChar> trigram_chars;
378   trigram_chars.reserve(size);
379   for (size_t i = 0; i < size; ++i) {
380     if (IsBinaryChar(data[i])) {
381       current_trigrams_.clear();
382       FinishFileIndexing(true);
383       return;
384     }
385     trigram_chars.push_back(TrigramCharForChar(data[i]));
386   }
387
388   for (size_t i = 0; i + 2 < size; ++i) {
389     Trigram trigram = TrigramAtIndex(trigram_chars, i);
390     if ((trigram != kUndefinedTrigram) && !current_trigrams_set_[trigram]) {
391       current_trigrams_set_[trigram] = true;
392       current_trigrams_.push_back(trigram);
393     }
394   }
395   current_file_offset_ += bytes_read - 2;
396   ReadFromFile();
397 }
398
399 void DevToolsFileSystemIndexer::FileSystemIndexingJob::FinishFileIndexing(
400     bool success) {
401   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
402   CloseFile();
403   if (success) {
404     FilePath file_path = indexing_it_->first;
405     g_trigram_index.Get().SetTrigramsForFile(
406         file_path, current_trigrams_, file_path_times_[file_path]);
407   }
408   ReportWorked();
409   ++indexing_it_;
410   IndexFiles();
411 }
412
413 void DevToolsFileSystemIndexer::FileSystemIndexingJob::CloseFile() {
414   if (current_file_.IsValid())
415     current_file_.Close(Bind(&FileSystemIndexingJob::CloseCallback, this));
416 }
417
418 void DevToolsFileSystemIndexer::FileSystemIndexingJob::CloseCallback(
419     base::File::Error error) {}
420
421 void DevToolsFileSystemIndexer::FileSystemIndexingJob::ReportWorked() {
422   TimeTicks current_time = TimeTicks::Now();
423   bool should_send_worked_nitification = true;
424   if (!last_worked_notification_time_.is_null()) {
425     TimeDelta delta = current_time - last_worked_notification_time_;
426     if (delta.InMilliseconds() < kMinTimeoutBetweenWorkedNitification)
427       should_send_worked_nitification = false;
428   }
429   ++files_indexed_;
430   if (should_send_worked_nitification) {
431     last_worked_notification_time_ = current_time;
432     BrowserThread::PostTask(
433         BrowserThread::UI, FROM_HERE, Bind(worked_callback_, files_indexed_));
434     files_indexed_ = 0;
435   }
436 }
437
438 DevToolsFileSystemIndexer::DevToolsFileSystemIndexer() {
439   static bool maps_initialized = false;
440   if (!maps_initialized) {
441     InitIsBinaryCharMap();
442     InitTrigramCharsMap();
443     maps_initialized = true;
444   }
445 }
446
447 DevToolsFileSystemIndexer::~DevToolsFileSystemIndexer() {}
448
449 scoped_refptr<DevToolsFileSystemIndexer::FileSystemIndexingJob>
450 DevToolsFileSystemIndexer::IndexPath(
451     const string& file_system_path,
452     const TotalWorkCallback& total_work_callback,
453     const WorkedCallback& worked_callback,
454     const DoneCallback& done_callback) {
455   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
456   scoped_refptr<FileSystemIndexingJob> indexing_job =
457       new FileSystemIndexingJob(FilePath::FromUTF8Unsafe(file_system_path),
458                                 total_work_callback,
459                                 worked_callback,
460                                 done_callback);
461   indexing_job->Start();
462   return indexing_job;
463 }
464
465 void DevToolsFileSystemIndexer::SearchInPath(const string& file_system_path,
466                                              const string& query,
467                                              const SearchCallback& callback) {
468   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
469   BrowserThread::PostTask(
470       BrowserThread::FILE,
471       FROM_HERE,
472       Bind(&DevToolsFileSystemIndexer::SearchInPathOnFileThread,
473            this,
474            file_system_path,
475            query,
476            callback));
477 }
478
479 void DevToolsFileSystemIndexer::SearchInPathOnFileThread(
480     const string& file_system_path,
481     const string& query,
482     const SearchCallback& callback) {
483   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
484   vector<FilePath> file_paths = g_trigram_index.Get().Search(query);
485   vector<string> result;
486   FilePath path = FilePath::FromUTF8Unsafe(file_system_path);
487   vector<FilePath>::const_iterator it = file_paths.begin();
488   for (; it != file_paths.end(); ++it) {
489     if (path.IsParent(*it))
490       result.push_back(it->AsUTF8Unsafe());
491   }
492   BrowserThread::PostTask(BrowserThread::UI, FROM_HERE, Bind(callback, result));
493 }