src/chrome/browser/history/snippet.cc

   1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "chrome/browser/history/snippet.h"
   6
   7 #include <algorithm>
   8
   9 #include "base/logging.h"
  10 #include "base/memory/scoped_ptr.h"
  11 #include "base/strings/string_split.h"
  12 #include "base/strings/string_util.h"
  13 #include "base/strings/utf_string_conversions.h"
  14 #include "third_party/icu/source/common/unicode/brkiter.h"
  15 #include "third_party/icu/source/common/unicode/utext.h"
  16 #include "third_party/icu/source/common/unicode/utf8.h"
  17
  18 namespace {
  19
  20 bool PairFirstLessThan(const Snippet::MatchPosition& a,
  21                        const Snippet::MatchPosition& b) {
  22   return a.first < b.first;
  23 }
  24
  25 // Combines all pairs after offset in match_positions that are contained
  26 // or touch the pair at offset.
  27 void CoalescePositionsFrom(size_t offset,
  28                            Snippet::MatchPositions* match_positions) {
  29   DCHECK(offset < match_positions->size());
  30   Snippet::MatchPosition& pair((*match_positions)[offset]);
  31   ++offset;
  32   while (offset < match_positions->size() &&
  33          pair.second >= (*match_positions)[offset].first) {
  34     pair.second = std::max(pair.second, (*match_positions)[offset].second);
  35     match_positions->erase(match_positions->begin() + offset);
  36   }
  37 }
  38
  39 // Makes sure there is a pair in match_positions that contains the specified
  40 // range. This keeps the pairs ordered in match_positions by first, and makes
  41 // sure none of the pairs in match_positions touch each other.
  42 void AddMatch(size_t start,
  43               size_t end,
  44               Snippet::MatchPositions* match_positions) {
  45   DCHECK(start < end);
  46   DCHECK(match_positions);
  47   Snippet::MatchPosition pair(start, end);
  48   if (match_positions->empty()) {
  49     match_positions->push_back(pair);
  50     return;
  51   }
  52   // There's at least one match. Find the position of the new match,
  53   // potentially extending pairs around it.
  54   Snippet::MatchPositions::iterator i =
  55       std::lower_bound(match_positions->begin(), match_positions->end(),
  56                        pair, &PairFirstLessThan);
  57   if (i != match_positions->end() && i->first == start) {
  58     // Match not at the end and there is already a pair with the same
  59     // start.
  60     if (end > i->second) {
  61       // New pair extends beyond existing pair. Extend existing pair and
  62       // coalesce matches after it.
  63       i->second = end;
  64       CoalescePositionsFrom(i - match_positions->begin(), match_positions);
  65     }  // else case, new pair completely contained in existing pair, nothing
  66        // to do.
  67   } else if (i == match_positions->begin()) {
  68     // Match at the beginning and the first pair doesn't have the same
  69     // start. Insert new pair and coalesce matches after it.
  70     match_positions->insert(i, pair);
  71     CoalescePositionsFrom(0, match_positions);
  72   } else {
  73     // Not at the beginning (but may be at the end).
  74     --i;
  75     if (start <= i->second && end > i->second) {
  76       // Previous element contains match. Extend it and coalesce.
  77       i->second = end;
  78       CoalescePositionsFrom(i - match_positions->begin(), match_positions);
  79     } else if (end > i->second) {
  80       // Region doesn't touch previous element. See if region touches current
  81       // element.
  82       ++i;
  83       if (i == match_positions->end() || end < i->first) {
  84         match_positions->insert(i, pair);
  85       } else {
  86         i->first = start;
  87         i->second = end;
  88         CoalescePositionsFrom(i - match_positions->begin(), match_positions);
  89       }
  90     }
  91   }
  92 }
  93
  94 // Converts an index in a utf8 string into the index in the corresponding utf16
  95 // string and returns the utf16 index. This is intended to be called in a loop
  96 // iterating through a utf8 string.
  97 //
  98 // utf8_string: the utf8 string.
  99 // utf8_length: length of the utf8 string.
 100 // offset: the utf8 offset to convert.
 101 // utf8_pos: current offset in the utf8 string. This is modified and on return
 102 //           matches offset.
 103 // wide_pos: current index in the wide string. This is the same as the return
 104 //           value.
 105 size_t AdvanceAndReturnUTF16Pos(const char* utf8_string,
 106                                 int32_t utf8_length,
 107                                 int32_t offset,
 108                                 int32_t* utf8_pos,
 109                                 size_t* utf16_pos) {
 110   DCHECK(offset >= *utf8_pos && offset <= utf8_length);
 111
 112   UChar32 wide_char;
 113   while (*utf8_pos < offset) {
 114     U8_NEXT(utf8_string, *utf8_pos, utf8_length, wide_char);
 115     *utf16_pos += (wide_char <= 0xFFFF) ? 1 : 2;
 116   }
 117   return *utf16_pos;
 118 }
 119
 120 // Given a character break iterator over a UTF-8 string, set the iterator
 121 // position to |*utf8_pos| and move by |count| characters. |count| can
 122 // be either positive or negative.
 123 void MoveByNGraphemes(icu::BreakIterator* bi, int count, size_t* utf8_pos) {
 124   // Ignore the return value. A side effect of the current position
 125   // being set at or following |*utf8_pos| is exploited here.
 126   // It's simpler than calling following(n) and then previous().
 127   // isBoundary() is not very fast, but should be good enough for the
 128   // snippet generation. If not, revisit the way we scan in ComputeSnippet.
 129   bi->isBoundary(*utf8_pos);
 130   bi->next(count);
 131   *utf8_pos = static_cast<size_t>(bi->current());
 132 }
 133
 // How much surrounding context to show on each side of a hit, measured in
 // graphemes (not bytes).
 const int kSnippetContext = 50;
 137
 138 // Returns true if next match falls within a snippet window
 139 // from the previous match. The window size is counted in terms
 140 // of graphemes rather than bytes in UTF-8.
 141 bool IsNextMatchWithinSnippetWindow(icu::BreakIterator* bi,
 142                                     size_t previous_match_end,
 143                                     size_t next_match_start) {
 144   // If it's within a window in terms of bytes, it's certain
 145   // that it's within a window in terms of graphemes as well.
 146   if (next_match_start < previous_match_end + kSnippetContext)
 147     return true;
 148   bi->isBoundary(previous_match_end);
 149   // An alternative to this is to call |bi->next()| at most
 150   // kSnippetContext times, compare |bi->current()| with |next_match_start|
 151   // after each call and return early if possible. There are other
 152   // heuristics to speed things up if necessary, but it's not likely that
 153   // we need to bother.
 154   bi->next(kSnippetContext);
 155   int64 current = bi->current();
 156   return (next_match_start < static_cast<uint64>(current) ||
 157           current == icu::BreakIterator::DONE);
 158 }
 159
 160 }  // namespace
 161
 162 // static
 163 void Snippet::ExtractMatchPositions(const std::string& offsets_str,
 164                                     const std::string& column_num,
 165                                     MatchPositions* match_positions) {
 166   DCHECK(match_positions);
 167   if (offsets_str.empty())
 168     return;
 169   std::vector<std::string> offsets;
 170   base::SplitString(offsets_str, ' ', &offsets);
 171   // SQLite offsets are sets of four integers:
 172   //   column, query term, match offset, match length
 173   // Matches within a string are marked by (start, end) pairs.
 174   for (size_t i = 0; i < offsets.size() - 3; i += 4) {
 175     if (offsets[i] != column_num)
 176       continue;
 177     const size_t start = atoi(offsets[i + 2].c_str());
 178     const size_t end = start + atoi(offsets[i + 3].c_str());
 179     // Switch to DCHECK after debugging http://crbug.com/15261.
 180     CHECK(end >= start);
 181     AddMatch(start, end, match_positions);
 182   }
 183 }
 184
 185 // static
 186 void Snippet::ConvertMatchPositionsToWide(
 187     const std::string& utf8_string,
 188     Snippet::MatchPositions* match_positions) {
 189   DCHECK(match_positions);
 190   int32_t utf8_pos = 0;
 191   size_t utf16_pos = 0;
 192   const char* utf8_cstring = utf8_string.c_str();
 193   const int32_t utf8_length = static_cast<int32_t>(utf8_string.size());
 194   for (Snippet::MatchPositions::iterator i = match_positions->begin();
 195        i != match_positions->end(); ++i) {
 196     i->first = AdvanceAndReturnUTF16Pos(utf8_cstring, utf8_length,
 197                                         i->first, &utf8_pos, &utf16_pos);
 198     i->second = AdvanceAndReturnUTF16Pos(utf8_cstring, utf8_length,
 199                                          i->second, &utf8_pos, &utf16_pos);
 200   }
 201 }
 202
 203 Snippet::Snippet() {
 204 }
 205
 206 Snippet::~Snippet() {
 207 }
 208
 209 void Snippet::ComputeSnippet(const MatchPositions& match_positions,
 210                              const std::string& document) {
 211   // The length of snippets we try to produce.
 212   // We can generate longer snippets but stop once we cross kSnippetMaxLength.
 213   const size_t kSnippetMaxLength = 200;
 214   const string16 kEllipsis = ASCIIToUTF16(" ... ");
 215
 216   UText* document_utext = NULL;
 217   UErrorCode status = U_ZERO_ERROR;
 218   document_utext = utext_openUTF8(document_utext, document.data(),
 219                                   document.size(), &status);
 220   // Locale does not matter because there's no per-locale customization
 221   // for character iterator.
 222   scoped_ptr<icu::BreakIterator> bi(icu::BreakIterator::createCharacterInstance(
 223       icu::Locale::getDefault(), status));
 224   bi->setText(document_utext, status);
 225   DCHECK(U_SUCCESS(status));
 226
 227   // We build the snippet by iterating through the matches and then grabbing
 228   // context around each match.  If matches are near enough each other (within
 229   // kSnippetContext), we skip the "..." between them.
 230   string16 snippet;
 231   size_t start = 0;
 232   for (size_t i = 0; i < match_positions.size(); ++i) {
 233     // Some shorter names for the current match.
 234     const size_t match_start = match_positions[i].first;
 235     const size_t match_end = match_positions[i].second;
 236
 237     // Switch to DCHECK after debugging http://crbug.com/15261.
 238     CHECK(match_end > match_start);
 239     CHECK(match_end <= document.size());
 240
 241     // Add the context, if any, to show before the match.
 242     size_t context_start = match_start;
 243     MoveByNGraphemes(bi.get(), -kSnippetContext, &context_start);
 244     start = std::max(start, context_start);
 245     if (start < match_start) {
 246       if (start > 0)
 247         snippet += kEllipsis;
 248       // Switch to DCHECK after debugging http://crbug.com/15261.
 249       CHECK(start < document.size());
 250       snippet += UTF8ToUTF16(document.substr(start, match_start - start));
 251     }
 252
 253     // Add the match.
 254     const size_t first = snippet.size();
 255     snippet += UTF8ToUTF16(document.substr(match_start,
 256                                           match_end - match_start));
 257     matches_.push_back(std::make_pair(first, snippet.size()));
 258
 259     // Compute the context, if any, to show after the match.
 260     size_t end;
 261     // Check if the next match falls within our snippet window.
 262     if (i + 1 < match_positions.size() &&
 263         IsNextMatchWithinSnippetWindow(bi.get(), match_end,
 264             match_positions[i + 1].first)) {
 265       // Yes, it's within the window.  Make the end context extend just up
 266       // to the next match.
 267       end = match_positions[i + 1].first;
 268       // Switch to DCHECK after debugging http://crbug.com/15261.
 269       CHECK(end >= match_end);
 270       CHECK(end <= document.size());
 271       snippet += UTF8ToUTF16(document.substr(match_end, end - match_end));
 272     } else {
 273       // No, there's either no next match or the next match is too far away.
 274       end = match_end;
 275       MoveByNGraphemes(bi.get(), kSnippetContext, &end);
 276       // Switch to DCHECK after debugging http://crbug.com/15261.
 277       CHECK(end >= match_end);
 278       CHECK(end <= document.size());
 279       snippet += UTF8ToUTF16(document.substr(match_end, end - match_end));
 280       if (end < document.size())
 281         snippet += kEllipsis;
 282     }
 283     start = end;
 284
 285     // Stop here if we have enough snippet computed.
 286     if (snippet.size() >= kSnippetMaxLength)
 287       break;
 288   }
 289
 290   utext_close(document_utext);
 291   swap(text_, snippet);
 292 }
 293
 294 void Snippet::Swap(Snippet* other) {
 295   text_.swap(other->text_);
 296   matches_.swap(other->matches_);
 297 }