Fix emulator build error
[platform/framework/web/chromium-efl.git] / components / browsing_topics / browsing_topics_calculator.cc
1 // Copyright 2022 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "components/browsing_topics/browsing_topics_calculator.h"
6
7 #include "base/containers/contains.h"
8 #include "base/metrics/histogram_functions.h"
9 #include "base/rand_util.h"
10 #include "base/ranges/algorithm.h"
11 #include "base/task/single_thread_task_runner.h"
12 #include "components/browsing_topics/annotator.h"
13 #include "components/browsing_topics/common/semantic_tree.h"
14 #include "components/history/core/browser/history_service.h"
15 #include "components/privacy_sandbox/canonical_topic.h"
16 #include "components/privacy_sandbox/privacy_sandbox_settings.h"
17 #include "content/public/browser/browsing_topics_site_data_manager.h"
18 #include "services/metrics/public/cpp/ukm_builders.h"
19 #include "services/metrics/public/cpp/ukm_recorder.h"
20 #include "third_party/blink/public/common/features.h"
21
22 namespace browsing_topics {
23
24 namespace {
25
26 // Return the max time among the three potential time boundaries:
27 // - `calculation_time` - `kBrowsingTopicsTimePeriodPerEpoch`.
28 // - Last epoch's calculation time.
29 // - `data_accessible_since`.
30 base::Time DeriveHistoryDataStartTime(
31     base::Time calculation_time,
32     const base::circular_deque<EpochTopics>& epochs,
33     base::Time data_accessible_since) {
34   base::Time start_time =
35       calculation_time -
36       blink::features::kBrowsingTopicsTimePeriodPerEpoch.Get();
37   if (!epochs.empty()) {
38     start_time = std::max(start_time, epochs.back().calculation_time());
39   }
40
41   return std::max(start_time, data_accessible_since);
42 }
43
44 // Return the max time among the three potential time boundaries:
45 // - `calculation_time` - `epochs_window` * `kBrowsingTopicsTimePeriodPerEpoch`.
46 // - The `epochs_window`-to-the-last epoch's calculation time.
47 // - `data_accessible_since`.
48 base::Time DeriveApiUsageContextDataStartTime(
49     base::Time calculation_time,
50     const base::circular_deque<EpochTopics>& epochs,
51     base::Time data_accessible_since) {
52   const size_t epochs_window =
53       blink::features::
54           kBrowsingTopicsNumberOfEpochsOfObservationDataToUseForFiltering.Get();
55   DCHECK_GT(epochs_window, 0u);
56
57   base::Time start_time =
58       calculation_time -
59       epochs_window * blink::features::kBrowsingTopicsTimePeriodPerEpoch.Get();
60
61   if (epochs.size() >= epochs_window) {
62     start_time = std::max(
63         start_time, epochs[epochs.size() - epochs_window].calculation_time());
64   }
65
66   return std::max(start_time, data_accessible_since);
67 }
68
69 void RecordCalculatorResultMetrics(
70     const BrowsingTopicsCalculator::CalculatorResultStatus& status,
71     const EpochTopics& epoch_topics) {
72   base::UmaHistogramEnumeration(
73       "BrowsingTopics.EpochTopicsCalculation.CalculatorResultStatus", status);
74
75   ukm::UkmRecorder* ukm_recorder = ukm::UkmRecorder::Get();
76   ukm::builders::BrowsingTopics_EpochTopicsCalculationResult builder(
77       ukm::NoURLSourceId());
78
79   if (status == BrowsingTopicsCalculator::CalculatorResultStatus::kSuccess) {
80     const std::vector<TopicAndDomains>& topics =
81         epoch_topics.top_topics_and_observing_domains();
82
83     if (topics.size() > 0 && topics[0].IsValid())
84       builder.SetTopTopic0(topics[0].topic().value());
85
86     if (topics.size() > 1 && topics[1].IsValid())
87       builder.SetTopTopic1(topics[1].topic().value());
88
89     if (topics.size() > 2 && topics[2].IsValid())
90       builder.SetTopTopic2(topics[2].topic().value());
91
92     if (topics.size() > 3 && topics[3].IsValid())
93       builder.SetTopTopic3(topics[3].topic().value());
94
95     if (topics.size() > 4 && topics[4].IsValid())
96       builder.SetTopTopic4(topics[4].topic().value());
97
98     builder.SetTaxonomyVersion(epoch_topics.taxonomy_version())
99         .SetModelVersion(epoch_topics.model_version())
100         .SetPaddedTopicsStartIndex(
101             epoch_topics.padded_top_topics_start_index());
102   }
103
104   builder.Record(ukm_recorder->Get());
105 }
106
107 // Derive the mapping from hosts to topics and the mapping from topics to hosts.
108 // Precondition: the annotation didn't fail in general (e.g. `ModelInfo` is
109 // valid).
110 void DeriveHostTopicsMapAndTopicHostsMap(
111     const std::vector<Annotation>& results,
112     std::map<HashedHost, std::set<Topic>>& host_topics_map,
113     std::map<Topic, std::set<HashedHost>>& topic_hosts_map) {
114   DCHECK(host_topics_map.empty());
115   DCHECK(topic_hosts_map.empty());
116
117   for (const Annotation& annotation : results) {
118     HashedHost host = HashMainFrameHostForStorage(annotation.input);
119
120     for (int32_t topic_id : annotation.topics) {
121       Topic topic = Topic(topic_id);
122
123       topic_hosts_map[topic].insert(host);
124       host_topics_map[host].insert(topic);
125     }
126   }
127 }
128
129 // For `topic`, derive the context domains that observed it. This is done by
130 // first getting the hosts about `topic` from `topic_hosts_map`, and
131 // for each site, get the callers (context domains) that were on that site and
132 // add the callers to a result set.
133 std::set<HashedDomain> GetTopicObservationDomains(
134     const Topic& topic,
135     const std::map<Topic, std::set<HashedHost>>& topic_hosts_map,
136     const std::map<HashedHost, std::vector<HashedDomain>>&
137         host_context_domains_map) {
138   std::set<HashedDomain> topic_observation_domains;
139
140   // If `topic` was padded, it may not exist in `topic_hosts_map`. In this
141   // case, return an empty set.
142   auto topic_it = topic_hosts_map.find(topic);
143   if (topic_it == topic_hosts_map.end())
144     return std::set<HashedDomain>();
145
146   const std::set<HashedHost>& hosts = topic_it->second;
147
148   for (const HashedHost& host : hosts) {
149     // `host` came from the history database, and it may not exist in the
150     // `host_context_domains_map` which came from the usage contexts
151     // database, due to e.g. per-context data deletion, database errors, etc.
152     // In this case, continue checking other hosts.
153     auto host_it = host_context_domains_map.find(host);
154     if (host_it == host_context_domains_map.end())
155       continue;
156
157     const std::vector<HashedDomain>& context_domains = host_it->second;
158
159     for (const HashedDomain& context_domain : context_domains) {
160       topic_observation_domains.insert(context_domain);
161
162       // To limit memory usage, cap the number of context domains to keep
163       // per-topic. The larger `HashedDomain`s will be kept. This is fair, as
164       // the hashing for context domains is per-user, so we are not
165       // prioritizing any domains in general.
166       if (topic_observation_domains.size() >
167           static_cast<size_t>(
168               blink::features::
169                   kBrowsingTopicsMaxNumberOfApiUsageContextDomainsToKeepPerTopic
170                       .Get())) {
171         topic_observation_domains.erase(topic_observation_domains.begin());
172       }
173     }
174   }
175
176   return topic_observation_domains;
177 }
178
179 }  // namespace
180
181 BrowsingTopicsCalculator::BrowsingTopicsCalculator(
182     privacy_sandbox::PrivacySandboxSettings* privacy_sandbox_settings,
183     history::HistoryService* history_service,
184     content::BrowsingTopicsSiteDataManager* site_data_manager,
185     Annotator* annotator,
186     const base::circular_deque<EpochTopics>& epochs,
187     bool is_manually_triggered,
188     CalculateCompletedCallback callback)
189     : privacy_sandbox_settings_(privacy_sandbox_settings),
190       history_service_(history_service),
191       site_data_manager_(site_data_manager),
192       annotator_(annotator),
193       calculate_completed_callback_(std::move(callback)),
194       calculation_time_(base::Time::Now()),
195       is_manually_triggered_(is_manually_triggered) {
196   history_data_start_time_ = DeriveHistoryDataStartTime(
197       calculation_time_, epochs,
198       privacy_sandbox_settings_->TopicsDataAccessibleSince());
199   api_usage_context_data_start_time_ = DeriveApiUsageContextDataStartTime(
200       calculation_time_, epochs,
201       privacy_sandbox_settings_->TopicsDataAccessibleSince());
202
203   // Continue asynchronously so that `calculate_completed_callback_` isn't
204   // called synchronously while `this` is being constructed.
205   base::SingleThreadTaskRunner::GetCurrentDefault()->PostTask(
206       FROM_HERE, base::BindOnce(&BrowsingTopicsCalculator::CheckCanCalculate,
207                                 weak_ptr_factory_.GetWeakPtr()));
208 }
209
210 BrowsingTopicsCalculator::~BrowsingTopicsCalculator() = default;
211
212 uint64_t BrowsingTopicsCalculator::GenerateRandUint64() {
213   return base::RandUint64();
214 }
215
216 void BrowsingTopicsCalculator::DeriveTopTopics(
217     const std::map<HashedHost, size_t>& history_hosts_count,
218     const std::map<HashedHost, std::set<Topic>>& host_topics_map,
219     std::vector<Topic>& top_topics,
220     size_t& padded_top_topics_start_index,
221     size_t& history_topics_count) {
222   DCHECK(top_topics.empty());
223   DCHECK_EQ(padded_top_topics_start_index, 0u);
224   DCHECK_EQ(history_topics_count, 0u);
225
226   // Derive the frequency of each topic, by summing up the frequencies of the
227   // associated hosts. TODO(yaoxia): consider applying inverse frequency of
228   // topics (https://github.com/jkarlin/topics/issues/42).
229   std::map<Topic, size_t> topics_count;
230   for (auto const& [host, host_count] : history_hosts_count) {
231     // A host wouldn't be found if there were no topics associated with it.
232     auto it = host_topics_map.find(host);
233     if (it == host_topics_map.end())
234       continue;
235
236     const std::set<Topic>& topics = it->second;
237     for (const Topic& topic : topics) {
238       topics_count[topic] += host_count;
239     }
240   }
241
242   history_topics_count = topics_count.size();
243
244   std::map<Topic, std::pair<bool, size_t>> topics_priorities_and_counts;
245   for (const auto& [topic, count] : topics_count) {
246     bool priority = privacy_sandbox_settings_->IsTopicPrioritized(
247         privacy_sandbox::CanonicalTopic(
248             topic, blink::features::kBrowsingTopicsTaxonomyVersion.Get()));
249     topics_priorities_and_counts[topic] = {priority, count};
250   }
251
252   // Get the top up to `kBrowsingTopicsNumberOfTopTopicsPerEpoch` topics,
253   // sorted by decreasing count.
254   using TopicsCountValue = std::pair<Topic, std::pair<bool, size_t>>;
255   std::vector<TopicsCountValue> top_topics_count(std::min(
256       static_cast<size_t>(
257           blink::features::kBrowsingTopicsNumberOfTopTopicsPerEpoch.Get()),
258       topics_priorities_and_counts.size()));
259
260   std::partial_sort_copy(topics_priorities_and_counts.begin(),
261                          topics_priorities_and_counts.end(),
262                          top_topics_count.begin(), top_topics_count.end(),
263                          [](auto& left, auto& right) {
264                            return (left.second.first > right.second.first) ||
265                                   (left.second.first == right.second.first &&
266                                    left.second.second > right.second.second);
267                          });
268
269   base::ranges::transform(top_topics_count, std::back_inserter(top_topics),
270                           &TopicsCountValue::first);
271
272   padded_top_topics_start_index = top_topics.size();
273
274   // Pad the top topics with distinct random topics until we have
275   // `kBrowsingTopicsNumberOfTopTopicsPerEpoch` topics.
276   SemanticTree semantic_tree;
277   while (top_topics.size() <
278          static_cast<size_t>(
279              blink::features::kBrowsingTopicsNumberOfTopTopicsPerEpoch.Get())) {
280     Topic padded_topic(0);
281
282     do {
283       int taxonomy_version =
284           blink::features::kBrowsingTopicsTaxonomyVersion.Get();
285       uint64_t padded_topic_index_decision = GenerateRandUint64();
286       padded_topic = semantic_tree.GetRandomTopic(taxonomy_version,
287                                                   padded_topic_index_decision);
288     } while (base::Contains(top_topics, padded_topic));
289
290     top_topics.emplace_back(std::move(padded_topic));
291   }
292 }
293
294 void BrowsingTopicsCalculator::CheckCanCalculate() {
295   if (!privacy_sandbox_settings_->IsTopicsAllowed()) {
296     OnCalculateCompleted(CalculatorResultStatus::kFailurePermissionDenied,
297                          EpochTopics(calculation_time_));
298     return;
299   }
300
301   // Get the the api usages context map (from the calling context domain to a
302   // set of history hosts) so that we can figure out which topics the APIs were
303   // called on.
304   site_data_manager_->GetBrowsingTopicsApiUsage(
305       /*begin_time=*/api_usage_context_data_start_time_,
306       /*end_time=*/calculation_time_,
307       base::BindOnce(&BrowsingTopicsCalculator::
308                          OnGetRecentBrowsingTopicsApiUsagesCompleted,
309                      weak_ptr_factory_.GetWeakPtr()));
310 }
311
312 void BrowsingTopicsCalculator::OnGetRecentBrowsingTopicsApiUsagesCompleted(
313     browsing_topics::ApiUsageContextQueryResult result) {
314   DCHECK(host_context_domains_map_.empty());
315
316   if (!result.success) {
317     OnCalculateCompleted(
318         CalculatorResultStatus::kFailureApiUsageContextQueryError,
319         EpochTopics(calculation_time_));
320     return;
321   }
322
323   for (const ApiUsageContext& usage_context : result.api_usage_contexts) {
324     host_context_domains_map_[usage_context.hashed_main_frame_host]
325         .emplace_back(usage_context.hashed_context_domain);
326   }
327
328   // `ApiUsageContext::hashed_main_frame_host` is a hashed number. To get the
329   // topic associated with it, we will need to match it against a set of raw
330   // hosts with topics. Thus, here we query the history with the larger time
331   // range (from `api_usage_context_data_start_time_` to `calculation_time_`) to
332   // get the raw hosts.
333   history::QueryOptions options;
334   options.begin_time = api_usage_context_data_start_time_;
335   options.end_time = calculation_time_;
336   options.duplicate_policy = history::QueryOptions::KEEP_ALL_DUPLICATES;
337
338   history_service_->QueryHistory(
339       std::u16string(), options,
340       base::BindOnce(
341           &BrowsingTopicsCalculator::OnGetRecentlyVisitedURLsCompleted,
342           weak_ptr_factory_.GetWeakPtr()),
343       &history_task_tracker_);
344 }
345
346 void BrowsingTopicsCalculator::OnGetRecentlyVisitedURLsCompleted(
347     history::QueryResults results) {
348   DCHECK(history_hosts_count_.empty());
349
350   std::set<std::string> raw_hosts;
351
352   for (const history::URLResult& url_result : results) {
353     if (!(url_result.content_annotations().annotation_flags &
354           history::VisitContentAnnotationFlag::kBrowsingTopicsEligible)) {
355       continue;
356     }
357
358     std::string raw_host = url_result.url().host();
359     raw_hosts.insert(raw_host);
360
361     if (url_result.visit_time() >= history_data_start_time_) {
362       HashedHost host = HashMainFrameHostForStorage(raw_host);
363       history_hosts_count_[host]++;
364     }
365   }
366
367   base::UmaHistogramCounts1000(
368       "BrowsingTopics.EpochTopicsCalculation.EligibleDistinctHistoryHostsCount",
369       history_hosts_count_.size());
370
371   // When the input is empty, we still want to wait for the model availability
372   // status to be known, before querying the model version. Thus we simply
373   // always call `NotifyWhenModelAvailable()` first. If the model
374   // availability status is already known, the function will be cheap and the
375   // callback will be synchronously called.
376   annotator_->NotifyWhenModelAvailable(base::BindOnce(
377       &BrowsingTopicsCalculator::OnRequestModelCompleted,
378       weak_ptr_factory_.GetWeakPtr(),
379       std::vector<std::string>(raw_hosts.begin(), raw_hosts.end())));
380 }
381
382 void BrowsingTopicsCalculator::OnRequestModelCompleted(
383     std::vector<std::string> raw_hosts) {
384   if (raw_hosts.empty()) {
385     OnGetTopicsForHostsCompleted(/*results=*/{});
386     return;
387   }
388
389   annotator_->BatchAnnotate(
390       base::BindOnce(&BrowsingTopicsCalculator::OnGetTopicsForHostsCompleted,
391                      weak_ptr_factory_.GetWeakPtr()),
392       raw_hosts);
393 }
394
395 void BrowsingTopicsCalculator::OnGetTopicsForHostsCompleted(
396     const std::vector<Annotation>& results) {
397   absl::optional<optimization_guide::ModelInfo> model_info =
398       annotator_->GetBrowsingTopicsModelInfo();
399
400   if (!model_info) {
401     OnCalculateCompleted(
402         CalculatorResultStatus::kFailureAnnotationExecutionError,
403         EpochTopics(calculation_time_));
404     return;
405   }
406
407   SemanticTree semantic_tree;
408   if (!semantic_tree.IsTaxonomySupported(
409           blink::features::kBrowsingTopicsTaxonomyVersion.Get())) {
410     OnCalculateCompleted(
411         CalculatorResultStatus::kFailureTaxonomyVersionNotSupportedInBinary,
412         EpochTopics(calculation_time_));
413     return;
414   }
415
416   const int64_t model_version = model_info->GetVersion();
417   DCHECK_GT(model_version, 0);
418
419   std::map<HashedHost, std::set<Topic>> host_topics_map;
420   std::map<Topic, std::set<HashedHost>> topic_hosts_map;
421   DeriveHostTopicsMapAndTopicHostsMap(results, host_topics_map,
422                                       topic_hosts_map);
423
424   std::vector<Topic> top_topics;
425   size_t padded_top_topics_start_index = 0u;
426   size_t history_topics_count = 0u;
427   DeriveTopTopics(history_hosts_count_, host_topics_map, top_topics,
428                   padded_top_topics_start_index, history_topics_count);
429
430   base::UmaHistogramCounts1000(
431       "BrowsingTopics.EpochTopicsCalculation.HistoryTopicsCount",
432       history_topics_count);
433
434   base::UmaHistogramCounts100(
435       "BrowsingTopics.EpochTopicsCalculation.TopTopicsCountBeforePadding",
436       padded_top_topics_start_index);
437
438   // For each top topic, derive the context domains that observed it
439   std::vector<TopicAndDomains> top_topics_and_observing_domains;
440
441   for (const Topic& topic : top_topics) {
442     if (!privacy_sandbox_settings_->IsTopicAllowed(
443             privacy_sandbox::CanonicalTopic(
444                 topic,
445                 blink::features::kBrowsingTopicsTaxonomyVersion.Get()))) {
446       top_topics_and_observing_domains.emplace_back(TopicAndDomains());
447       continue;
448     }
449
450     std::set<HashedDomain> topic_observation_domains =
451         GetTopicObservationDomains(topic, topic_hosts_map,
452                                    host_context_domains_map_);
453
454     // Calculate descendant topics + their observing context domains
455     std::vector<Topic> descendant_topics =
456         semantic_tree.GetDescendantTopics(topic);
457     for (const Topic& descendant_topic : descendant_topics) {
458       std::set<HashedDomain> descendant_topic_observation_domains =
459           GetTopicObservationDomains(descendant_topic, topic_hosts_map,
460                                      host_context_domains_map_);
461       topic_observation_domains.insert(
462           descendant_topic_observation_domains.begin(),
463           descendant_topic_observation_domains.end());
464     }
465
466     base::UmaHistogramCounts1000(
467         "BrowsingTopics.EpochTopicsCalculation."
468         "ObservationContextDomainsCountPerTopTopic",
469         topic_observation_domains.size());
470
471     top_topics_and_observing_domains.emplace_back(
472         TopicAndDomains(topic, std::move(topic_observation_domains)));
473   }
474
475   OnCalculateCompleted(
476       CalculatorResultStatus::kSuccess,
477       EpochTopics(std::move(top_topics_and_observing_domains),
478                   padded_top_topics_start_index, CurrentConfigVersion(),
479                   blink::features::kBrowsingTopicsTaxonomyVersion.Get(),
480                   model_version, calculation_time_, is_manually_triggered_));
481 }
482
483 void BrowsingTopicsCalculator::OnCalculateCompleted(
484     CalculatorResultStatus status,
485     EpochTopics epoch_topics) {
486   DCHECK(status != CalculatorResultStatus::kSuccess || !epoch_topics.empty());
487
488   RecordCalculatorResultMetrics(status, epoch_topics);
489
490   std::move(calculate_completed_callback_).Run(std::move(epoch_topics));
491
492   // Do not add code after this. BrowsingTopicsCalculator has been destroyed.
493 }
494
495 }  // namespace browsing_topics