1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/browser/history/query_parser.h"
9 #include "base/compiler_specific.h"
10 #include "base/i18n/break_iterator.h"
11 #include "base/i18n/case_conversion.h"
12 #include "base/logging.h"
13 #include "base/stl_util.h"
14 #include "base/strings/utf_string_conversions.h"
18 // Returns true if |mp1.first| is less than |mp2.first|. This is used to
19 // sort match positions.
20 int CompareMatchPosition(const Snippet::MatchPosition& mp1,
21 const Snippet::MatchPosition& mp2) {
22 return mp1.first < mp2.first;
25 // Returns true if |mp2| intersects |mp1|. This is intended for use by
26 // CoalesceMatchesFrom and isn't meant as a general intersection comparison
28 bool SnippetIntersects(const Snippet::MatchPosition& mp1,
29 const Snippet::MatchPosition& mp2) {
30 return mp2.first >= mp1.first && mp2.first <= mp1.second;
33 // Coalesces match positions in |matches| after index that intersect the match
34 // position at |index|.
35 void CoalesceMatchesFrom(size_t index, Snippet::MatchPositions* matches) {
36 Snippet::MatchPosition& mp = (*matches)[index];
37 for (Snippet::MatchPositions::iterator i = matches->begin() + index + 1;
38 i != matches->end(); ) {
39 if (SnippetIntersects(mp, *i)) {
40 mp.second = std::max(mp.second, i->second);
41 i = matches->erase(i);
48 // Sorts the match positions in |matches| by their first index, then coalesces
49 // any match positions that intersect each other.
50 void CoalseAndSortMatchPositions(Snippet::MatchPositions* matches) {
51 std::sort(matches->begin(), matches->end(), &CompareMatchPosition);
52 // WARNING: we don't use iterator here as CoalesceMatchesFrom may remove
54 for (size_t i = 0; i < matches->size(); ++i)
55 CoalesceMatchesFrom(i, matches);
58 // Returns true if the character is considered a quote.
59 bool IsQueryQuote(wchar_t ch) {
61 ch == 0xab || // left pointing double angle bracket
62 ch == 0xbb || // right pointing double angle bracket
63 ch == 0x201c || // left double quotation mark
64 ch == 0x201d || // right double quotation mark
65 ch == 0x201e; // double low-9 quotation mark
70 // Inheritance structure:
71 // Queries are represented as trees of QueryNodes.
72 // QueryNodes are either a collection of subnodes (a QueryNodeList)
73 // or a single word (a QueryNodeWord).
75 // A QueryNodeWord is a single word in the query.
76 class QueryNodeWord : public QueryNode {
78 explicit QueryNodeWord(const base::string16& word);
79 virtual ~QueryNodeWord();
81 const base::string16& word() const { return word_; }
83 void set_literal(bool literal) { literal_ = literal; }
86 virtual int AppendToSQLiteQuery(base::string16* query) const OVERRIDE;
87 virtual bool IsWord() const OVERRIDE;
88 virtual bool Matches(const base::string16& word, bool exact) const OVERRIDE;
89 virtual bool HasMatchIn(
90 const std::vector<QueryWord>& words,
91 Snippet::MatchPositions* match_positions) const OVERRIDE;
92 virtual bool HasMatchIn(
93 const std::vector<QueryWord>& words) const OVERRIDE;
94 virtual void AppendWords(std::vector<base::string16>* words) const OVERRIDE;
100 DISALLOW_COPY_AND_ASSIGN(QueryNodeWord);
103 QueryNodeWord::QueryNodeWord(const base::string16& word)
107 QueryNodeWord::~QueryNodeWord() {}
109 int QueryNodeWord::AppendToSQLiteQuery(base::string16* query) const {
110 query->append(word_);
112 // Use prefix search if we're not literal and long enough.
113 if (!literal_ && QueryParser::IsWordLongEnoughForPrefixSearch(word_))
118 bool QueryNodeWord::IsWord() const {
122 bool QueryNodeWord::Matches(const base::string16& word, bool exact) const {
123 if (exact || !QueryParser::IsWordLongEnoughForPrefixSearch(word_))
124 return word == word_;
125 return word.size() >= word_.size() &&
126 (word_.compare(0, word_.size(), word, 0, word_.size()) == 0);
129 bool QueryNodeWord::HasMatchIn(const std::vector<QueryWord>& words,
130 Snippet::MatchPositions* match_positions) const {
131 bool matched = false;
132 for (size_t i = 0; i < words.size(); ++i) {
133 if (Matches(words[i].word, false)) {
134 size_t match_start = words[i].position;
135 match_positions->push_back(
136 Snippet::MatchPosition(match_start,
137 match_start + static_cast<int>(word_.size())));
144 bool QueryNodeWord::HasMatchIn(const std::vector<QueryWord>& words) const {
145 for (size_t i = 0; i < words.size(); ++i) {
146 if (Matches(words[i].word, false))
152 void QueryNodeWord::AppendWords(std::vector<base::string16>* words) const {
153 words->push_back(word_);
156 // A QueryNodeList has a collection of QueryNodes which are deleted in the end.
157 class QueryNodeList : public QueryNode {
159 typedef std::vector<QueryNode*> QueryNodeVector;
162 virtual ~QueryNodeList();
164 QueryNodeVector* children() { return &children_; }
166 void AddChild(QueryNode* node);
168 // Remove empty subnodes left over from other parsing.
169 void RemoveEmptySubnodes();
172 virtual int AppendToSQLiteQuery(base::string16* query) const OVERRIDE;
173 virtual bool IsWord() const OVERRIDE;
174 virtual bool Matches(const base::string16& word, bool exact) const OVERRIDE;
175 virtual bool HasMatchIn(
176 const std::vector<QueryWord>& words,
177 Snippet::MatchPositions* match_positions) const OVERRIDE;
178 virtual bool HasMatchIn(
179 const std::vector<QueryWord>& words) const OVERRIDE;
180 virtual void AppendWords(std::vector<base::string16>* words) const OVERRIDE;
183 int AppendChildrenToString(base::string16* query) const;
185 QueryNodeVector children_;
188 DISALLOW_COPY_AND_ASSIGN(QueryNodeList);
191 QueryNodeList::QueryNodeList() {}
193 QueryNodeList::~QueryNodeList() {
194 STLDeleteElements(&children_);
197 void QueryNodeList::AddChild(QueryNode* node) {
198 children_.push_back(node);
201 void QueryNodeList::RemoveEmptySubnodes() {
202 for (size_t i = 0; i < children_.size(); ++i) {
203 if (children_[i]->IsWord())
206 QueryNodeList* list_node = static_cast<QueryNodeList*>(children_[i]);
207 list_node->RemoveEmptySubnodes();
208 if (list_node->children()->empty()) {
209 children_.erase(children_.begin() + i);
216 int QueryNodeList::AppendToSQLiteQuery(base::string16* query) const {
217 return AppendChildrenToString(query);
220 bool QueryNodeList::IsWord() const {
224 bool QueryNodeList::Matches(const base::string16& word, bool exact) const {
229 bool QueryNodeList::HasMatchIn(const std::vector<QueryWord>& words,
230 Snippet::MatchPositions* match_positions) const {
235 bool QueryNodeList::HasMatchIn(const std::vector<QueryWord>& words) const {
240 void QueryNodeList::AppendWords(std::vector<base::string16>* words) const {
241 for (size_t i = 0; i < children_.size(); ++i)
242 children_[i]->AppendWords(words);
245 int QueryNodeList::AppendChildrenToString(base::string16* query) const {
247 for (QueryNodeVector::const_iterator node = children_.begin();
248 node != children_.end(); ++node) {
249 if (node != children_.begin())
250 query->push_back(L' ');
251 num_words += (*node)->AppendToSQLiteQuery(query);
256 // A QueryNodePhrase is a phrase query ("quoted").
257 class QueryNodePhrase : public QueryNodeList {
260 virtual ~QueryNodePhrase();
263 virtual int AppendToSQLiteQuery(base::string16* query) const OVERRIDE;
264 virtual bool HasMatchIn(
265 const std::vector<QueryWord>& words,
266 Snippet::MatchPositions* match_positions) const OVERRIDE;
267 virtual bool HasMatchIn(
268 const std::vector<QueryWord>& words) const OVERRIDE;
271 bool MatchesAll(const std::vector<QueryWord>& words,
272 const QueryWord** first_word,
273 const QueryWord** last_word) const;
274 DISALLOW_COPY_AND_ASSIGN(QueryNodePhrase);
277 QueryNodePhrase::QueryNodePhrase() {}
279 QueryNodePhrase::~QueryNodePhrase() {}
281 int QueryNodePhrase::AppendToSQLiteQuery(base::string16* query) const {
282 query->push_back(L'"');
283 int num_words = AppendChildrenToString(query);
284 query->push_back(L'"');
288 bool QueryNodePhrase::MatchesAll(const std::vector<QueryWord>& words,
289 const QueryWord** first_word,
290 const QueryWord** last_word) const {
291 if (words.size() < children_.size())
294 for (size_t i = 0, max = words.size() - children_.size() + 1; i < max; ++i) {
295 bool matched_all = true;
296 for (size_t j = 0; j < children_.size(); ++j) {
297 if (!children_[j]->Matches(words[i + j].word, true)) {
303 *first_word = &words[i];
304 *last_word = &words[i + children_.size() - 1];
311 bool QueryNodePhrase::HasMatchIn(
312 const std::vector<QueryWord>& words,
313 Snippet::MatchPositions* match_positions) const {
314 const QueryWord* first_word;
315 const QueryWord* last_word;
317 if (MatchesAll(words, &first_word, &last_word)) {
318 match_positions->push_back(
319 Snippet::MatchPosition(first_word->position,
320 last_word->position + last_word->word.length()));
326 bool QueryNodePhrase::HasMatchIn(const std::vector<QueryWord>& words) const {
327 const QueryWord* first_word;
328 const QueryWord* last_word;
329 return MatchesAll(words, &first_word, &last_word);
332 QueryParser::QueryParser() {}
335 bool QueryParser::IsWordLongEnoughForPrefixSearch(const base::string16& word) {
336 DCHECK(!word.empty());
337 size_t minimum_length = 3;
338 // We intentionally exclude Hangul Jamos (both Conjoining and compatibility)
339 // because they 'behave like' Latin letters. Moreover, we should
340 // normalize the former before reaching here.
341 if (0xAC00 <= word[0] && word[0] <= 0xD7A3)
343 return word.size() >= minimum_length;
346 int QueryParser::ParseQuery(const base::string16& query,
347 base::string16* sqlite_query) {
349 if (!ParseQueryImpl(query, &root))
351 return root.AppendToSQLiteQuery(sqlite_query);
354 void QueryParser::ParseQueryWords(const base::string16& query,
355 std::vector<base::string16>* words) {
357 if (!ParseQueryImpl(query, &root))
359 root.AppendWords(words);
362 void QueryParser::ParseQueryNodes(const base::string16& query,
363 std::vector<QueryNode*>* nodes) {
365 if (ParseQueryImpl(base::i18n::ToLower(query), &root))
366 nodes->swap(*root.children());
369 bool QueryParser::DoesQueryMatch(const base::string16& text,
370 const std::vector<QueryNode*>& query_nodes,
371 Snippet::MatchPositions* match_positions) {
372 if (query_nodes.empty())
375 std::vector<QueryWord> query_words;
376 base::string16 lower_text = base::i18n::ToLower(text);
377 ExtractQueryWords(lower_text, &query_words);
379 if (query_words.empty())
382 Snippet::MatchPositions matches;
383 for (size_t i = 0; i < query_nodes.size(); ++i) {
384 if (!query_nodes[i]->HasMatchIn(query_words, &matches))
387 if (lower_text.length() != text.length()) {
388 // The lower case string differs from the original string. The matches are
390 // TODO(sky): we need a better way to align the positions so that we don't
391 // completely punt here.
392 match_positions->clear();
394 CoalseAndSortMatchPositions(&matches);
395 match_positions->swap(matches);
400 bool QueryParser::DoesQueryMatch(const std::vector<QueryWord>& query_words,
401 const std::vector<QueryNode*>& query_nodes) {
402 if (query_nodes.empty() || query_words.empty())
405 for (size_t i = 0; i < query_nodes.size(); ++i) {
406 if (!query_nodes[i]->HasMatchIn(query_words))
412 bool QueryParser::ParseQueryImpl(const base::string16& query,
413 QueryNodeList* root) {
414 base::i18n::BreakIterator iter(query, base::i18n::BreakIterator::BREAK_WORD);
415 // TODO(evanm): support a locale here
419 // To handle nesting, we maintain a stack of QueryNodeLists.
420 // The last element (back) of the stack contains the current, deepest node.
421 std::vector<QueryNodeList*> query_stack;
422 query_stack.push_back(root);
424 bool in_quotes = false; // whether we're currently in a quoted phrase
425 while (iter.Advance()) {
426 // Just found a span between 'prev' (inclusive) and 'pos' (exclusive). It
427 // is not necessarily a word, but could also be a sequence of punctuation
430 QueryNodeWord* word_node = new QueryNodeWord(iter.GetString());
432 word_node->set_literal(true);
433 query_stack.back()->AddChild(word_node);
434 } else { // Punctuation.
435 if (IsQueryQuote(query[iter.prev()])) {
437 QueryNodeList* quotes_node = new QueryNodePhrase;
438 query_stack.back()->AddChild(quotes_node);
439 query_stack.push_back(quotes_node);
442 query_stack.pop_back(); // Stop adding to the quoted phrase.
449 root->RemoveEmptySubnodes();
453 void QueryParser::ExtractQueryWords(const base::string16& text,
454 std::vector<QueryWord>* words) {
455 base::i18n::BreakIterator iter(text, base::i18n::BreakIterator::BREAK_WORD);
456 // TODO(evanm): support a locale here
460 while (iter.Advance()) {
461 // Just found a span between 'prev' (inclusive) and 'pos' (exclusive). It
462 // is not necessarily a word, but could also be a sequence of punctuation
465 base::string16 word = iter.GetString();
467 words->push_back(QueryWord());
468 words->back().word = word;
469 words->back().position = iter.prev();