1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/browser/history/visitsegment_database.h"
13 #include "base/logging.h"
14 #include "base/stl_util.h"
15 #include "base/strings/string_util.h"
16 #include "base/strings/utf_string_conversions.h"
17 #include "chrome/browser/history/page_usage_data.h"
18 #include "sql/statement.h"
19 #include "sql/transaction.h"
21 // The following tables are used to store url segment information.
25 // name A unique string to represent that segment. (URL derived)
26 // url_id ID of the url currently used to represent this segment.
30 // segment_id Corresponding segment id
31 // time_slot time stamp identifying for what day this entry is about
32 // visit_count Number of visit in the segment
37 VisitSegmentDatabase::VisitSegmentDatabase() {
40 VisitSegmentDatabase::~VisitSegmentDatabase() {
43 bool VisitSegmentDatabase::InitSegmentTables() {
45 if (!GetDB().DoesTableExist("segments")) {
46 if (!GetDB().Execute("CREATE TABLE segments ("
47 "id INTEGER PRIMARY KEY,"
49 "url_id INTEGER NON NULL)")) {
54 "CREATE INDEX segments_name ON segments(name)")) {
59 // This was added later, so we need to try to create it even if the table
61 if (!GetDB().Execute("CREATE INDEX IF NOT EXISTS segments_url_id ON "
65 // Segment usage table.
66 if (!GetDB().DoesTableExist("segment_usage")) {
67 if (!GetDB().Execute("CREATE TABLE segment_usage ("
68 "id INTEGER PRIMARY KEY,"
69 "segment_id INTEGER NOT NULL,"
70 "time_slot INTEGER NOT NULL,"
71 "visit_count INTEGER DEFAULT 0 NOT NULL)")) {
75 "CREATE INDEX segment_usage_time_slot_segment_id ON "
76 "segment_usage(time_slot, segment_id)")) {
81 // Added in a later version, so we always need to try to creat this index.
82 if (!GetDB().Execute("CREATE INDEX IF NOT EXISTS segments_usage_seg_id "
83 "ON segment_usage(segment_id)"))
89 bool VisitSegmentDatabase::DropSegmentTables() {
90 // Dropping the tables will implicitly delete the indices.
91 return GetDB().Execute("DROP TABLE segments") &&
92 GetDB().Execute("DROP TABLE segment_usage");
95 // Note: the segment name is derived from the URL but is not a URL. It is
96 // a string that can be easily recreated from various URLS. Maybe this should
97 // be an MD5 to limit the length.
100 std::string VisitSegmentDatabase::ComputeSegmentName(const GURL& url) {
101 // TODO(brettw) this should probably use the registry controlled
103 GURL::Replacements r;
104 const char kWWWDot[] = "www.";
105 const int kWWWDotLen = arraysize(kWWWDot) - 1;
107 std::string host = url.host();
108 const char* host_c = host.c_str();
109 // Remove www. to avoid some dups.
110 if (static_cast<int>(host.size()) > kWWWDotLen &&
111 LowerCaseEqualsASCII(host_c, host_c + kWWWDotLen, kWWWDot)) {
112 r.SetHost(host.c_str(),
113 url::Component(kWWWDotLen,
114 static_cast<int>(host.size()) - kWWWDotLen));
116 // Remove other stuff we don't want.
123 return url.ReplaceComponents(r).spec();
126 SegmentID VisitSegmentDatabase::GetSegmentNamed(
127 const std::string& segment_name) {
128 sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
129 "SELECT id FROM segments WHERE name = ?"));
130 statement.BindString(0, segment_name);
132 if (statement.Step())
133 return statement.ColumnInt64(0);
137 bool VisitSegmentDatabase::UpdateSegmentRepresentationURL(SegmentID segment_id,
139 sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
140 "UPDATE segments SET url_id = ? WHERE id = ?"));
141 statement.BindInt64(0, url_id);
142 statement.BindInt64(1, segment_id);
144 return statement.Run();
147 URLID VisitSegmentDatabase::GetSegmentRepresentationURL(SegmentID segment_id) {
148 sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
149 "SELECT url_id FROM segments WHERE id = ?"));
150 statement.BindInt64(0, segment_id);
152 if (statement.Step())
153 return statement.ColumnInt64(0);
157 SegmentID VisitSegmentDatabase::CreateSegment(URLID url_id,
158 const std::string& segment_name) {
159 sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
160 "INSERT INTO segments (name, url_id) VALUES (?,?)"));
161 statement.BindString(0, segment_name);
162 statement.BindInt64(1, url_id);
165 return GetDB().GetLastInsertRowId();
169 bool VisitSegmentDatabase::IncreaseSegmentVisitCount(SegmentID segment_id,
172 base::Time t = ts.LocalMidnight();
174 sql::Statement select(GetDB().GetCachedStatement(SQL_FROM_HERE,
175 "SELECT id, visit_count FROM segment_usage "
176 "WHERE time_slot = ? AND segment_id = ?"));
177 select.BindInt64(0, t.ToInternalValue());
178 select.BindInt64(1, segment_id);
180 if (!select.is_valid())
184 sql::Statement update(GetDB().GetCachedStatement(SQL_FROM_HERE,
185 "UPDATE segment_usage SET visit_count = ? WHERE id = ?"));
186 update.BindInt64(0, select.ColumnInt64(1) + static_cast<int64>(amount));
187 update.BindInt64(1, select.ColumnInt64(0));
191 sql::Statement insert(GetDB().GetCachedStatement(SQL_FROM_HERE,
192 "INSERT INTO segment_usage "
193 "(segment_id, time_slot, visit_count) VALUES (?, ?, ?)"));
194 insert.BindInt64(0, segment_id);
195 insert.BindInt64(1, t.ToInternalValue());
196 insert.BindInt64(2, static_cast<int64>(amount));
202 void VisitSegmentDatabase::QuerySegmentUsage(
203 base::Time from_time,
204 int max_result_count,
205 std::vector<PageUsageData*>* results) {
206 // This function gathers the highest-ranked segments in two queries.
207 // The first gathers scores for all segments.
208 // The second gathers segment data (url, title, etc.) for the highest-ranked
211 // Gather all the segment scores.
212 sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
213 "SELECT segment_id, time_slot, visit_count "
214 "FROM segment_usage WHERE time_slot >= ? "
215 "ORDER BY segment_id"));
216 if (!statement.is_valid())
219 base::Time ts = from_time.LocalMidnight();
220 statement.BindInt64(0, ts.ToInternalValue());
222 base::Time now = base::Time::Now();
223 SegmentID last_segment_id = 0;
224 PageUsageData* pud = NULL;
226 while (statement.Step()) {
227 SegmentID segment_id = statement.ColumnInt64(0);
228 if (segment_id != last_segment_id) {
230 pud->SetScore(score);
231 results->push_back(pud);
234 pud = new PageUsageData(segment_id);
236 last_segment_id = segment_id;
239 base::Time timeslot =
240 base::Time::FromInternalValue(statement.ColumnInt64(1));
241 int visit_count = statement.ColumnInt(2);
242 int days_ago = (now - timeslot).InDays();
244 // Score for this day in isolation.
245 float day_visits_score = 1.0f + log(static_cast<float>(visit_count));
246 // Recent visits count more than historical ones, so we multiply in a boost
247 // related to how long ago this day was.
248 // This boost is a curve that smoothly goes through these values:
249 // Today gets 3x, a week ago 2x, three weeks ago 1.5x, falling off to 1x
250 // at the limit of how far we reach into the past.
251 float recency_boost = 1.0f + (2.0f * (1.0f / (1.0f + days_ago/7.0f)));
252 score += recency_boost * day_visits_score;
256 pud->SetScore(score);
257 results->push_back(pud);
260 // Limit to the top kResultCount results.
261 std::sort(results->begin(), results->end(), PageUsageData::Predicate);
262 if (static_cast<int>(results->size()) > max_result_count) {
263 STLDeleteContainerPointers(results->begin() + max_result_count,
265 results->resize(max_result_count);
268 // Now fetch the details about the entries we care about.
269 sql::Statement statement2(GetDB().GetCachedStatement(SQL_FROM_HERE,
270 "SELECT urls.url, urls.title FROM urls "
271 "JOIN segments ON segments.url_id = urls.id "
272 "WHERE segments.id = ?"));
274 if (!statement2.is_valid())
277 for (size_t i = 0; i < results->size(); ++i) {
278 PageUsageData* pud = (*results)[i];
279 statement2.BindInt64(0, pud->GetID());
280 if (statement2.Step()) {
281 pud->SetURL(GURL(statement2.ColumnString(0)));
282 pud->SetTitle(statement2.ColumnString16(1));
284 statement2.Reset(true);
288 bool VisitSegmentDatabase::DeleteSegmentData(base::Time older_than) {
289 sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
290 "DELETE FROM segment_usage WHERE time_slot < ?"));
291 statement.BindInt64(0, older_than.LocalMidnight().ToInternalValue());
293 return statement.Run();
296 bool VisitSegmentDatabase::DeleteSegmentForURL(URLID url_id) {
297 sql::Statement delete_usage(GetDB().GetCachedStatement(SQL_FROM_HERE,
298 "DELETE FROM segment_usage WHERE segment_id IN "
299 "(SELECT id FROM segments WHERE url_id = ?)"));
300 delete_usage.BindInt64(0, url_id);
302 if (!delete_usage.Run())
305 sql::Statement delete_seg(GetDB().GetCachedStatement(SQL_FROM_HERE,
306 "DELETE FROM segments WHERE url_id = ?"));
307 delete_seg.BindInt64(0, url_id);
309 return delete_seg.Run();
312 bool VisitSegmentDatabase::MigratePresentationIndex() {
313 sql::Transaction transaction(&GetDB());
314 return transaction.Begin() &&
315 GetDB().Execute("DROP TABLE presentation") &&
316 GetDB().Execute("CREATE TABLE segments_tmp ("
317 "id INTEGER PRIMARY KEY,"
319 "url_id INTEGER NON NULL)") &&
320 GetDB().Execute("INSERT INTO segments_tmp SELECT "
321 "id, name, url_id FROM segments") &&
322 GetDB().Execute("DROP TABLE segments") &&
323 GetDB().Execute("ALTER TABLE segments_tmp RENAME TO segments") &&
324 transaction.Commit();
327 } // namespace history