1 // Copyright 2022 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "components/browsing_topics/browsing_topics_calculator.h"
7 #include "base/containers/contains.h"
8 #include "base/metrics/histogram_functions.h"
9 #include "base/rand_util.h"
10 #include "base/ranges/algorithm.h"
11 #include "base/task/single_thread_task_runner.h"
12 #include "components/browsing_topics/annotator.h"
13 #include "components/browsing_topics/common/semantic_tree.h"
14 #include "components/history/core/browser/history_service.h"
15 #include "components/privacy_sandbox/canonical_topic.h"
16 #include "components/privacy_sandbox/privacy_sandbox_settings.h"
17 #include "content/public/browser/browsing_topics_site_data_manager.h"
18 #include "services/metrics/public/cpp/ukm_builders.h"
19 #include "services/metrics/public/cpp/ukm_recorder.h"
20 #include "third_party/blink/public/common/features.h"
22 namespace browsing_topics {
26 // Return the max time among the three potential time boundaries:
27 // - `calculation_time` - `kBrowsingTopicsTimePeriodPerEpoch`.
28 // - Last epoch's calculation time.
29 // - `data_accessible_since`.
30 base::Time DeriveHistoryDataStartTime(
31 base::Time calculation_time,
32 const base::circular_deque<EpochTopics>& epochs,
33 base::Time data_accessible_since) {
34 base::Time start_time =
36 blink::features::kBrowsingTopicsTimePeriodPerEpoch.Get();
37 if (!epochs.empty()) {
38 start_time = std::max(start_time, epochs.back().calculation_time());
41 return std::max(start_time, data_accessible_since);
44 // Return the max time among the three potential time boundaries:
45 // - `calculation_time` - `epochs_window` * `kBrowsingTopicsTimePeriodPerEpoch`.
46 // - The `epochs_window`-to-the-last epoch's calculation time.
47 // - `data_accessible_since`.
48 base::Time DeriveApiUsageContextDataStartTime(
49 base::Time calculation_time,
50 const base::circular_deque<EpochTopics>& epochs,
51 base::Time data_accessible_since) {
52 const size_t epochs_window =
54 kBrowsingTopicsNumberOfEpochsOfObservationDataToUseForFiltering.Get();
55 DCHECK_GT(epochs_window, 0u);
57 base::Time start_time =
59 epochs_window * blink::features::kBrowsingTopicsTimePeriodPerEpoch.Get();
61 if (epochs.size() >= epochs_window) {
62 start_time = std::max(
63 start_time, epochs[epochs.size() - epochs_window].calculation_time());
66 return std::max(start_time, data_accessible_since);
69 void RecordCalculatorResultMetrics(
70 const BrowsingTopicsCalculator::CalculatorResultStatus& status,
71 const EpochTopics& epoch_topics) {
72 base::UmaHistogramEnumeration(
73 "BrowsingTopics.EpochTopicsCalculation.CalculatorResultStatus", status);
75 ukm::UkmRecorder* ukm_recorder = ukm::UkmRecorder::Get();
76 ukm::builders::BrowsingTopics_EpochTopicsCalculationResult builder(
77 ukm::NoURLSourceId());
79 if (status == BrowsingTopicsCalculator::CalculatorResultStatus::kSuccess) {
80 const std::vector<TopicAndDomains>& topics =
81 epoch_topics.top_topics_and_observing_domains();
83 if (topics.size() > 0 && topics[0].IsValid())
84 builder.SetTopTopic0(topics[0].topic().value());
86 if (topics.size() > 1 && topics[1].IsValid())
87 builder.SetTopTopic1(topics[1].topic().value());
89 if (topics.size() > 2 && topics[2].IsValid())
90 builder.SetTopTopic2(topics[2].topic().value());
92 if (topics.size() > 3 && topics[3].IsValid())
93 builder.SetTopTopic3(topics[3].topic().value());
95 if (topics.size() > 4 && topics[4].IsValid())
96 builder.SetTopTopic4(topics[4].topic().value());
98 builder.SetTaxonomyVersion(epoch_topics.taxonomy_version())
99 .SetModelVersion(epoch_topics.model_version())
100 .SetPaddedTopicsStartIndex(
101 epoch_topics.padded_top_topics_start_index());
104 builder.Record(ukm_recorder->Get());
107 // Derive the mapping from hosts to topics and the mapping from topics to hosts.
108 // Precondition: the annotation didn't fail in general (e.g. `ModelInfo` is
110 void DeriveHostTopicsMapAndTopicHostsMap(
111 const std::vector<Annotation>& results,
112 std::map<HashedHost, std::set<Topic>>& host_topics_map,
113 std::map<Topic, std::set<HashedHost>>& topic_hosts_map) {
114 DCHECK(host_topics_map.empty());
115 DCHECK(topic_hosts_map.empty());
117 for (const Annotation& annotation : results) {
118 HashedHost host = HashMainFrameHostForStorage(annotation.input);
120 for (int32_t topic_id : annotation.topics) {
121 Topic topic = Topic(topic_id);
123 topic_hosts_map[topic].insert(host);
124 host_topics_map[host].insert(topic);
129 // For `topic`, derive the context domains that observed it. This is done by
130 // first getting the hosts about `topic` from `topic_hosts_map`, and
131 // for each site, get the callers (context domains) that were on that site and
132 // add the callers to a result set.
133 std::set<HashedDomain> GetTopicObservationDomains(
135 const std::map<Topic, std::set<HashedHost>>& topic_hosts_map,
136 const std::map<HashedHost, std::vector<HashedDomain>>&
137 host_context_domains_map) {
138 std::set<HashedDomain> topic_observation_domains;
140 // If `topic` was padded, it may not exist in `topic_hosts_map`. In this
141 // case, return an empty set.
142 auto topic_it = topic_hosts_map.find(topic);
143 if (topic_it == topic_hosts_map.end())
144 return std::set<HashedDomain>();
146 const std::set<HashedHost>& hosts = topic_it->second;
148 for (const HashedHost& host : hosts) {
149 // `host` came from the history database, and it may not exist in the
150 // `host_context_domains_map` which came from the usage contexts
151 // database, due to e.g. per-context data deletion, database errors, etc.
152 // In this case, continue checking other hosts.
153 auto host_it = host_context_domains_map.find(host);
154 if (host_it == host_context_domains_map.end())
157 const std::vector<HashedDomain>& context_domains = host_it->second;
159 for (const HashedDomain& context_domain : context_domains) {
160 topic_observation_domains.insert(context_domain);
162 // To limit memory usage, cap the number of context domains to keep
163 // per-topic. The larger `HashedDomain`s will be kept. This is fair, as
164 // the hashing for context domains is per-user, so we are not
165 // prioritizing any domains in general.
166 if (topic_observation_domains.size() >
169 kBrowsingTopicsMaxNumberOfApiUsageContextDomainsToKeepPerTopic
171 topic_observation_domains.erase(topic_observation_domains.begin());
176 return topic_observation_domains;
181 BrowsingTopicsCalculator::BrowsingTopicsCalculator(
182 privacy_sandbox::PrivacySandboxSettings* privacy_sandbox_settings,
183 history::HistoryService* history_service,
184 content::BrowsingTopicsSiteDataManager* site_data_manager,
185 Annotator* annotator,
186 const base::circular_deque<EpochTopics>& epochs,
187 bool is_manually_triggered,
188 CalculateCompletedCallback callback)
189 : privacy_sandbox_settings_(privacy_sandbox_settings),
190 history_service_(history_service),
191 site_data_manager_(site_data_manager),
192 annotator_(annotator),
193 calculate_completed_callback_(std::move(callback)),
194 calculation_time_(base::Time::Now()),
195 is_manually_triggered_(is_manually_triggered) {
196 history_data_start_time_ = DeriveHistoryDataStartTime(
197 calculation_time_, epochs,
198 privacy_sandbox_settings_->TopicsDataAccessibleSince());
199 api_usage_context_data_start_time_ = DeriveApiUsageContextDataStartTime(
200 calculation_time_, epochs,
201 privacy_sandbox_settings_->TopicsDataAccessibleSince());
203 // Continue asynchronously so that `calculate_completed_callback_` isn't
204 // called synchronously while `this` is being constructed.
205 base::SingleThreadTaskRunner::GetCurrentDefault()->PostTask(
206 FROM_HERE, base::BindOnce(&BrowsingTopicsCalculator::CheckCanCalculate,
207 weak_ptr_factory_.GetWeakPtr()));
210 BrowsingTopicsCalculator::~BrowsingTopicsCalculator() = default;
212 uint64_t BrowsingTopicsCalculator::GenerateRandUint64() {
213 return base::RandUint64();
216 void BrowsingTopicsCalculator::DeriveTopTopics(
217 const std::map<HashedHost, size_t>& history_hosts_count,
218 const std::map<HashedHost, std::set<Topic>>& host_topics_map,
219 std::vector<Topic>& top_topics,
220 size_t& padded_top_topics_start_index,
221 size_t& history_topics_count) {
222 DCHECK(top_topics.empty());
223 DCHECK_EQ(padded_top_topics_start_index, 0u);
224 DCHECK_EQ(history_topics_count, 0u);
226 // Derive the frequency of each topic, by summing up the frequencies of the
227 // associated hosts. TODO(yaoxia): consider applying inverse frequency of
228 // topics (https://github.com/jkarlin/topics/issues/42).
229 std::map<Topic, size_t> topics_count;
230 for (auto const& [host, host_count] : history_hosts_count) {
231 // A host wouldn't be found if there were no topics associated with it.
232 auto it = host_topics_map.find(host);
233 if (it == host_topics_map.end())
236 const std::set<Topic>& topics = it->second;
237 for (const Topic& topic : topics) {
238 topics_count[topic] += host_count;
242 history_topics_count = topics_count.size();
244 std::map<Topic, std::pair<bool, size_t>> topics_priorities_and_counts;
245 for (const auto& [topic, count] : topics_count) {
246 bool priority = privacy_sandbox_settings_->IsTopicPrioritized(
247 privacy_sandbox::CanonicalTopic(
248 topic, blink::features::kBrowsingTopicsTaxonomyVersion.Get()));
249 topics_priorities_and_counts[topic] = {priority, count};
252 // Get the top up to `kBrowsingTopicsNumberOfTopTopicsPerEpoch` topics,
253 // sorted by decreasing count.
254 using TopicsCountValue = std::pair<Topic, std::pair<bool, size_t>>;
255 std::vector<TopicsCountValue> top_topics_count(std::min(
257 blink::features::kBrowsingTopicsNumberOfTopTopicsPerEpoch.Get()),
258 topics_priorities_and_counts.size()));
260 std::partial_sort_copy(topics_priorities_and_counts.begin(),
261 topics_priorities_and_counts.end(),
262 top_topics_count.begin(), top_topics_count.end(),
263 [](auto& left, auto& right) {
264 return (left.second.first > right.second.first) ||
265 (left.second.first == right.second.first &&
266 left.second.second > right.second.second);
269 base::ranges::transform(top_topics_count, std::back_inserter(top_topics),
270 &TopicsCountValue::first);
272 padded_top_topics_start_index = top_topics.size();
274 // Pad the top topics with distinct random topics until we have
275 // `kBrowsingTopicsNumberOfTopTopicsPerEpoch` topics.
276 SemanticTree semantic_tree;
277 while (top_topics.size() <
279 blink::features::kBrowsingTopicsNumberOfTopTopicsPerEpoch.Get())) {
280 Topic padded_topic(0);
283 int taxonomy_version =
284 blink::features::kBrowsingTopicsTaxonomyVersion.Get();
285 uint64_t padded_topic_index_decision = GenerateRandUint64();
286 padded_topic = semantic_tree.GetRandomTopic(taxonomy_version,
287 padded_topic_index_decision);
288 } while (base::Contains(top_topics, padded_topic));
290 top_topics.emplace_back(std::move(padded_topic));
294 void BrowsingTopicsCalculator::CheckCanCalculate() {
295 if (!privacy_sandbox_settings_->IsTopicsAllowed()) {
296 OnCalculateCompleted(CalculatorResultStatus::kFailurePermissionDenied,
297 EpochTopics(calculation_time_));
301 // Get the the api usages context map (from the calling context domain to a
302 // set of history hosts) so that we can figure out which topics the APIs were
304 site_data_manager_->GetBrowsingTopicsApiUsage(
305 /*begin_time=*/api_usage_context_data_start_time_,
306 /*end_time=*/calculation_time_,
307 base::BindOnce(&BrowsingTopicsCalculator::
308 OnGetRecentBrowsingTopicsApiUsagesCompleted,
309 weak_ptr_factory_.GetWeakPtr()));
312 void BrowsingTopicsCalculator::OnGetRecentBrowsingTopicsApiUsagesCompleted(
313 browsing_topics::ApiUsageContextQueryResult result) {
314 DCHECK(host_context_domains_map_.empty());
316 if (!result.success) {
317 OnCalculateCompleted(
318 CalculatorResultStatus::kFailureApiUsageContextQueryError,
319 EpochTopics(calculation_time_));
323 for (const ApiUsageContext& usage_context : result.api_usage_contexts) {
324 host_context_domains_map_[usage_context.hashed_main_frame_host]
325 .emplace_back(usage_context.hashed_context_domain);
328 // `ApiUsageContext::hashed_main_frame_host` is a hashed number. To get the
329 // topic associated with it, we will need to match it against a set of raw
330 // hosts with topics. Thus, here we query the history with the larger time
331 // range (from `api_usage_context_data_start_time_` to `calculation_time_`) to
332 // get the raw hosts.
333 history::QueryOptions options;
334 options.begin_time = api_usage_context_data_start_time_;
335 options.end_time = calculation_time_;
336 options.duplicate_policy = history::QueryOptions::KEEP_ALL_DUPLICATES;
338 history_service_->QueryHistory(
339 std::u16string(), options,
341 &BrowsingTopicsCalculator::OnGetRecentlyVisitedURLsCompleted,
342 weak_ptr_factory_.GetWeakPtr()),
343 &history_task_tracker_);
346 void BrowsingTopicsCalculator::OnGetRecentlyVisitedURLsCompleted(
347 history::QueryResults results) {
348 DCHECK(history_hosts_count_.empty());
350 std::set<std::string> raw_hosts;
352 for (const history::URLResult& url_result : results) {
353 if (!(url_result.content_annotations().annotation_flags &
354 history::VisitContentAnnotationFlag::kBrowsingTopicsEligible)) {
358 std::string raw_host = url_result.url().host();
359 raw_hosts.insert(raw_host);
361 if (url_result.visit_time() >= history_data_start_time_) {
362 HashedHost host = HashMainFrameHostForStorage(raw_host);
363 history_hosts_count_[host]++;
367 base::UmaHistogramCounts1000(
368 "BrowsingTopics.EpochTopicsCalculation.EligibleDistinctHistoryHostsCount",
369 history_hosts_count_.size());
371 // When the input is empty, we still want to wait for the model availability
372 // status to be known, before querying the model version. Thus we simply
373 // always call `NotifyWhenModelAvailable()` first. If the model
374 // availability status is already known, the function will be cheap and the
375 // callback will be synchronously called.
376 annotator_->NotifyWhenModelAvailable(base::BindOnce(
377 &BrowsingTopicsCalculator::OnRequestModelCompleted,
378 weak_ptr_factory_.GetWeakPtr(),
379 std::vector<std::string>(raw_hosts.begin(), raw_hosts.end())));
382 void BrowsingTopicsCalculator::OnRequestModelCompleted(
383 std::vector<std::string> raw_hosts) {
384 if (raw_hosts.empty()) {
385 OnGetTopicsForHostsCompleted(/*results=*/{});
389 annotator_->BatchAnnotate(
390 base::BindOnce(&BrowsingTopicsCalculator::OnGetTopicsForHostsCompleted,
391 weak_ptr_factory_.GetWeakPtr()),
395 void BrowsingTopicsCalculator::OnGetTopicsForHostsCompleted(
396 const std::vector<Annotation>& results) {
397 absl::optional<optimization_guide::ModelInfo> model_info =
398 annotator_->GetBrowsingTopicsModelInfo();
401 OnCalculateCompleted(
402 CalculatorResultStatus::kFailureAnnotationExecutionError,
403 EpochTopics(calculation_time_));
407 SemanticTree semantic_tree;
408 if (!semantic_tree.IsTaxonomySupported(
409 blink::features::kBrowsingTopicsTaxonomyVersion.Get())) {
410 OnCalculateCompleted(
411 CalculatorResultStatus::kFailureTaxonomyVersionNotSupportedInBinary,
412 EpochTopics(calculation_time_));
416 const int64_t model_version = model_info->GetVersion();
417 DCHECK_GT(model_version, 0);
419 std::map<HashedHost, std::set<Topic>> host_topics_map;
420 std::map<Topic, std::set<HashedHost>> topic_hosts_map;
421 DeriveHostTopicsMapAndTopicHostsMap(results, host_topics_map,
424 std::vector<Topic> top_topics;
425 size_t padded_top_topics_start_index = 0u;
426 size_t history_topics_count = 0u;
427 DeriveTopTopics(history_hosts_count_, host_topics_map, top_topics,
428 padded_top_topics_start_index, history_topics_count);
430 base::UmaHistogramCounts1000(
431 "BrowsingTopics.EpochTopicsCalculation.HistoryTopicsCount",
432 history_topics_count);
434 base::UmaHistogramCounts100(
435 "BrowsingTopics.EpochTopicsCalculation.TopTopicsCountBeforePadding",
436 padded_top_topics_start_index);
438 // For each top topic, derive the context domains that observed it
439 std::vector<TopicAndDomains> top_topics_and_observing_domains;
441 for (const Topic& topic : top_topics) {
442 if (!privacy_sandbox_settings_->IsTopicAllowed(
443 privacy_sandbox::CanonicalTopic(
445 blink::features::kBrowsingTopicsTaxonomyVersion.Get()))) {
446 top_topics_and_observing_domains.emplace_back(TopicAndDomains());
450 std::set<HashedDomain> topic_observation_domains =
451 GetTopicObservationDomains(topic, topic_hosts_map,
452 host_context_domains_map_);
454 // Calculate descendant topics + their observing context domains
455 std::vector<Topic> descendant_topics =
456 semantic_tree.GetDescendantTopics(topic);
457 for (const Topic& descendant_topic : descendant_topics) {
458 std::set<HashedDomain> descendant_topic_observation_domains =
459 GetTopicObservationDomains(descendant_topic, topic_hosts_map,
460 host_context_domains_map_);
461 topic_observation_domains.insert(
462 descendant_topic_observation_domains.begin(),
463 descendant_topic_observation_domains.end());
466 base::UmaHistogramCounts1000(
467 "BrowsingTopics.EpochTopicsCalculation."
468 "ObservationContextDomainsCountPerTopTopic",
469 topic_observation_domains.size());
471 top_topics_and_observing_domains.emplace_back(
472 TopicAndDomains(topic, std::move(topic_observation_domains)));
475 OnCalculateCompleted(
476 CalculatorResultStatus::kSuccess,
477 EpochTopics(std::move(top_topics_and_observing_domains),
478 padded_top_topics_start_index, CurrentConfigVersion(),
479 blink::features::kBrowsingTopicsTaxonomyVersion.Get(),
480 model_version, calculation_time_, is_manually_triggered_));
483 void BrowsingTopicsCalculator::OnCalculateCompleted(
484 CalculatorResultStatus status,
485 EpochTopics epoch_topics) {
486 DCHECK(status != CalculatorResultStatus::kSuccess || !epoch_topics.empty());
488 RecordCalculatorResultMetrics(status, epoch_topics);
490 std::move(calculate_completed_callback_).Run(std::move(epoch_topics));
492 // Do not add code after this. BrowsingTopicsCalculator has been destroyed.
495 } // namespace browsing_topics