1 // Copyright 2017 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "components/ukm/ukm_recorder_impl.h"
12 #include "base/feature_list.h"
13 #include "base/metrics/field_trial.h"
14 #include "base/metrics/field_trial_params.h"
15 #include "base/metrics/histogram_macros.h"
16 #include "base/metrics/metrics_hashes.h"
17 #include "base/rand_util.h"
18 #include "base/strings/string_number_conversions.h"
19 #include "base/strings/string_split.h"
20 #include "components/variations/variations_associated_data.h"
21 #include "services/metrics/public/cpp/ukm_decode.h"
22 #include "services/metrics/public/cpp/ukm_source.h"
23 #include "services/metrics/public/cpp/ukm_source_id.h"
24 #include "third_party/metrics_proto/ukm/entry.pb.h"
25 #include "third_party/metrics_proto/ukm/report.pb.h"
26 #include "third_party/metrics_proto/ukm/source.pb.h"
// URL scheme names duplicated locally because their canonical definitions
// cannot be referenced from here: kChromeUIScheme is defined in content, which
// this code can't depend on since it's used by iOS too; kExtensionScheme is
// defined in extensions, which isn't always available here; and kAppScheme
// will be defined in code that isn't available here.
constexpr char kChromeUIScheme[] = "chrome";
constexpr char kExtensionScheme[] = "chrome-extension";
constexpr char kAppScheme[] = "app";
41 const base::Feature kUkmSamplingRateFeature{"UkmSamplingRate",
42 base::FEATURE_DISABLED_BY_DEFAULT};
44 // Gets the list of whitelisted Entries as string. Format is a comma separated
45 // list of Entry names (as strings).
46 std::string GetWhitelistEntries() {
47 return base::GetFieldTrialParamValueByFeature(kUkmFeature,
51 bool IsWhitelistedSourceId(SourceId source_id) {
52 return GetSourceIdType(source_id) == SourceIdType::NAVIGATION_ID ||
53 GetSourceIdType(source_id) == SourceIdType::APP_ID;
56 // Gets the maximum number of Sources we'll keep in memory before discarding any
57 // new ones being added.
58 size_t GetMaxSources() {
59 constexpr size_t kDefaultMaxSources = 500;
60 return static_cast<size_t>(base::GetFieldTrialParamByFeatureAsInt(
61 kUkmFeature, "MaxSources", kDefaultMaxSources));
64 // Gets the maximum number of unreferenced Sources kept after purging sources
65 // that were added to the log.
66 size_t GetMaxKeptSources() {
67 constexpr size_t kDefaultMaxKeptSources = 100;
68 return static_cast<size_t>(base::GetFieldTrialParamByFeatureAsInt(
69 kUkmFeature, "MaxKeptSources", kDefaultMaxKeptSources));
72 // Gets the maximum number of Entries we'll keep in memory before discarding any
73 // new ones being added.
74 size_t GetMaxEntries() {
75 constexpr size_t kDefaultMaxEntries = 5000;
76 return static_cast<size_t>(base::GetFieldTrialParamByFeatureAsInt(
77 kUkmFeature, "MaxEntries", kDefaultMaxEntries));
80 // Returns whether |url| has one of the schemes supported for logging to UKM.
81 // URLs with other schemes will not be logged.
82 bool HasSupportedScheme(const GURL& url) {
83 return url.SchemeIsHTTPOrHTTPS() || url.SchemeIs(url::kFtpScheme) ||
84 url.SchemeIs(url::kAboutScheme) || url.SchemeIs(kChromeUIScheme) ||
85 url.SchemeIs(kExtensionScheme) || url.SchemeIs(kAppScheme);
88 // True if we should record the initial_url field of the UKM Source proto.
89 bool ShouldRecordInitialUrl() {
90 return base::GetFieldTrialParamByFeatureAsBool(kUkmFeature,
91 "RecordInitialUrl", false);
94 enum class DroppedDataReason {
96 RECORDING_DISABLED = 1,
99 UNSUPPORTED_URL_SCHEME = 4,
101 EXTENSION_URLS_DISABLED = 6,
102 EXTENSION_NOT_SYNCED = 7,
105 NUM_DROPPED_DATA_REASONS
108 void RecordDroppedSource(DroppedDataReason reason) {
109 UMA_HISTOGRAM_ENUMERATION(
110 "UKM.Sources.Dropped", static_cast<int>(reason),
111 static_cast<int>(DroppedDataReason::NUM_DROPPED_DATA_REASONS));
114 void RecordDroppedEntry(DroppedDataReason reason) {
115 UMA_HISTOGRAM_ENUMERATION(
116 "UKM.Entries.Dropped", static_cast<int>(reason),
117 static_cast<int>(DroppedDataReason::NUM_DROPPED_DATA_REASONS));
120 void StoreEntryProto(const mojom::UkmEntry& in, Entry* out) {
121 DCHECK(!out->has_source_id());
122 DCHECK(!out->has_event_hash());
124 out->set_source_id(in.source_id);
125 out->set_event_hash(in.event_hash);
126 for (const auto& metric : in.metrics) {
127 Entry::Metric* proto_metric = out->add_metrics();
128 proto_metric->set_metric_hash(metric.first);
129 proto_metric->set_value(metric.second);
133 GURL SanitizeURL(const GURL& url) {
134 GURL::Replacements remove_params;
135 remove_params.ClearUsername();
136 remove_params.ClearPassword();
137 // chrome:// and about: URLs params are never used for navigation, only to
138 // prepopulate data on the page, so don't include their params.
139 if (url.SchemeIs(url::kAboutScheme) || url.SchemeIs("chrome")) {
140 remove_params.ClearQuery();
142 if (url.SchemeIs(kExtensionScheme)) {
143 remove_params.ClearPath();
144 remove_params.ClearQuery();
145 remove_params.ClearRef();
147 return url.ReplaceComponents(remove_params);
150 void AppendWhitelistedUrls(
151 const std::map<SourceId, std::unique_ptr<UkmSource>>& sources,
152 std::unordered_set<std::string>* urls) {
153 for (const auto& kv : sources) {
154 if (IsWhitelistedSourceId(kv.first)) {
155 urls->insert(kv.second->url().spec());
156 // Some non-navigation sources only record origin as a URL.
157 // Add the origin from the navigation source to match those too.
158 urls->insert(kv.second->url().GetOrigin().spec());
163 bool HasUnknownMetrics(const ukm::builders::DecodeMap& decode_map,
164 const mojom::UkmEntry& entry) {
165 const auto it = decode_map.find(entry.event_hash);
166 if (it == decode_map.end())
168 const auto& metric_map = it->second.metric_map;
169 for (const auto& metric : entry.metrics) {
170 if (metric_map.count(metric.first) == 0)
178 UkmRecorderImpl::UkmRecorderImpl() : recording_enabled_(false) {}
179 UkmRecorderImpl::~UkmRecorderImpl() = default;
182 void UkmRecorderImpl::CreateFallbackSamplingTrial(
183 bool is_stable_channel,
184 base::FeatureList* feature_list) {
185 static const char kSampledGroup_Stable[] = "Sampled_NoSeed_Stable";
186 static const char kSampledGroup_Other[] = "Sampled_NoSeed_Other";
187 const char* sampled_group = kSampledGroup_Other;
188 int default_sampling = 1; // Sampling is 1-in-N; this is N.
190 // Nothing is sampled out except for "stable" which omits almost everything
191 // in this configuration. This is done so that clients that fail to receive
192 // a configuration from the server do not bias aggregated results because
193 // of a relatively large number of records from them.
194 if (is_stable_channel) {
195 sampled_group = kSampledGroup_Stable;
196 default_sampling = 1000000;
199 scoped_refptr<base::FieldTrial> trial(
200 base::FieldTrialList::FactoryGetFieldTrial(
201 kUkmSamplingRateFeature.name, 100, sampled_group,
202 base::FieldTrialList::kNoExpirationYear, 1, 1,
203 base::FieldTrial::ONE_TIME_RANDOMIZED, nullptr));
205 // Everybody (100%) should have a sampling configuration.
206 std::map<std::string, std::string> params = {
207 {"_default_sampling", base::IntToString(default_sampling)}};
208 variations::AssociateVariationParams(trial->trial_name(), sampled_group,
210 trial->AppendGroup(sampled_group, 100);
212 // Setup the feature.
213 feature_list->RegisterFieldTrialOverride(
214 kUkmSamplingRateFeature.name, base::FeatureList::OVERRIDE_ENABLE_FEATURE,
218 UkmRecorderImpl::EventAggregate::EventAggregate() = default;
219 UkmRecorderImpl::EventAggregate::~EventAggregate() = default;
221 UkmRecorderImpl::Recordings::Recordings() = default;
222 UkmRecorderImpl::Recordings& UkmRecorderImpl::Recordings::operator=(
223 Recordings&&) = default;
224 UkmRecorderImpl::Recordings::~Recordings() = default;
226 void UkmRecorderImpl::Recordings::Reset() {
227 *this = Recordings();
230 void UkmRecorderImpl::Recordings::SourceCounts::Reset() {
231 *this = SourceCounts();
234 void UkmRecorderImpl::EnableRecording(bool extensions) {
235 DVLOG(1) << "UkmRecorderImpl::EnableRecording, extensions=" << extensions;
236 recording_enabled_ = true;
237 extensions_enabled_ = extensions;
240 void UkmRecorderImpl::DisableRecording() {
241 DVLOG(1) << "UkmRecorderImpl::DisableRecording";
242 if (recording_enabled_)
243 recording_is_continuous_ = false;
244 recording_enabled_ = false;
245 extensions_enabled_ = false;
248 void UkmRecorderImpl::DisableSamplingForTesting() {
249 sampling_enabled_ = false;
252 void UkmRecorderImpl::Purge() {
253 DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
255 recording_is_continuous_ = false;
258 void UkmRecorderImpl::SetIsWebstoreExtensionCallback(
259 const IsWebstoreExtensionCallback& callback) {
260 is_webstore_extension_callback_ = callback;
263 void UkmRecorderImpl::StoreRecordingsInReport(Report* report) {
264 DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
266 std::set<SourceId> ids_seen;
267 for (const auto& entry : recordings_.entries) {
268 Entry* proto_entry = report->add_entries();
269 StoreEntryProto(*entry, proto_entry);
270 ids_seen.insert(entry->source_id);
273 std::unordered_set<std::string> url_whitelist;
274 recordings_.carryover_urls_whitelist.swap(url_whitelist);
275 AppendWhitelistedUrls(recordings_.sources, &url_whitelist);
277 std::vector<std::unique_ptr<UkmSource>> unsent_sources;
278 int unmatched_sources = 0;
279 for (auto& kv : recordings_.sources) {
280 // If the source id is not whitelisted, don't send it unless it has
281 // associated entries and the URL matches a URL of a whitelisted source.
282 // Note: If ShouldRestrictToWhitelistedSourceIds() is true, this logic will
283 // not be hit as the source would have already been filtered in
284 // UpdateSourceURL().
285 if (!IsWhitelistedSourceId(kv.first)) {
286 // UkmSource should not keep initial_url for non-navigation source IDs.
287 DCHECK_EQ(1u, kv.second->urls().size());
288 if (!url_whitelist.count(kv.second->url().spec())) {
289 RecordDroppedSource(DroppedDataReason::NOT_MATCHED);
293 if (!base::ContainsKey(ids_seen, kv.first)) {
294 unsent_sources.push_back(std::move(kv.second));
298 Source* proto_source = report->add_sources();
299 kv.second->PopulateProto(proto_source);
300 if (!ShouldRecordInitialUrl())
301 proto_source->clear_initial_url();
303 for (const auto& event_and_aggregate : recordings_.event_aggregations) {
304 if (event_and_aggregate.second.metrics.empty())
306 const EventAggregate& event_aggregate = event_and_aggregate.second;
307 Aggregate* proto_aggregate = report->add_aggregates();
308 proto_aggregate->set_source_id(0); // Across all sources.
309 proto_aggregate->set_event_hash(event_and_aggregate.first);
310 proto_aggregate->set_total_count(event_aggregate.total_count);
311 proto_aggregate->set_dropped_due_to_limits(
312 event_aggregate.dropped_due_to_limits);
313 proto_aggregate->set_dropped_due_to_sampling(
314 event_aggregate.dropped_due_to_sampling);
315 proto_aggregate->set_dropped_due_to_whitelist(
316 event_aggregate.dropped_due_to_whitelist);
317 for (const auto& metric_and_aggregate : event_aggregate.metrics) {
318 const MetricAggregate& aggregate = metric_and_aggregate.second;
319 Aggregate::Metric* proto_metric = proto_aggregate->add_metrics();
320 proto_metric->set_metric_hash(metric_and_aggregate.first);
321 proto_metric->set_value_sum(aggregate.value_sum);
322 proto_metric->set_value_square_sum(aggregate.value_square_sum);
323 if (aggregate.total_count != event_aggregate.total_count) {
324 proto_metric->set_total_count(aggregate.total_count);
326 if (aggregate.dropped_due_to_limits !=
327 event_aggregate.dropped_due_to_limits) {
328 proto_metric->set_dropped_due_to_limits(
329 aggregate.dropped_due_to_limits);
331 if (aggregate.dropped_due_to_sampling !=
332 event_aggregate.dropped_due_to_sampling) {
333 proto_metric->set_dropped_due_to_sampling(
334 aggregate.dropped_due_to_sampling);
336 if (aggregate.dropped_due_to_whitelist !=
337 event_aggregate.dropped_due_to_whitelist) {
338 proto_metric->set_dropped_due_to_whitelist(
339 aggregate.dropped_due_to_whitelist);
344 UMA_HISTOGRAM_COUNTS_1000("UKM.Sources.SerializedCount",
345 recordings_.sources.size() - unsent_sources.size());
346 UMA_HISTOGRAM_COUNTS_100000("UKM.Entries.SerializedCount2",
347 recordings_.entries.size());
348 UMA_HISTOGRAM_COUNTS_1000("UKM.Sources.UnsentSourcesCount",
349 unsent_sources.size());
351 Report::SourceCounts* source_counts_proto = report->mutable_source_counts();
352 source_counts_proto->set_observed(recordings_.source_counts.observed);
353 source_counts_proto->set_navigation_sources(
354 recordings_.source_counts.navigation_sources);
355 source_counts_proto->set_unmatched_sources(unmatched_sources);
356 source_counts_proto->set_deferred_sources(unsent_sources.size());
357 source_counts_proto->set_carryover_sources(
358 recordings_.source_counts.carryover_sources);
360 recordings_.sources.clear();
361 recordings_.source_counts.Reset();
362 recordings_.entries.clear();
363 recordings_.event_aggregations.clear();
365 report->set_is_continuous(recording_is_continuous_);
366 recording_is_continuous_ = true;
368 // Keep at most |max_kept_sources|, prioritizing most-recent entries (by
370 const size_t max_kept_sources = GetMaxKeptSources();
371 if (unsent_sources.size() > max_kept_sources) {
372 std::nth_element(unsent_sources.begin(),
373 unsent_sources.begin() + max_kept_sources,
374 unsent_sources.end(),
375 [](const std::unique_ptr<ukm::UkmSource>& lhs,
376 const std::unique_ptr<ukm::UkmSource>& rhs) {
377 return lhs->creation_time() > rhs->creation_time();
379 unsent_sources.resize(max_kept_sources);
382 for (auto& source : unsent_sources) {
383 // We already matched these sources against the URL whitelist.
384 // Re-whitelist them for the next report.
385 recordings_.carryover_urls_whitelist.insert(source->url().spec());
386 recordings_.sources.emplace(source->id(), std::move(source));
388 UMA_HISTOGRAM_COUNTS_1000("UKM.Sources.KeptSourcesCount",
389 recordings_.sources.size());
390 recordings_.source_counts.carryover_sources = recordings_.sources.size();
393 bool UkmRecorderImpl::ShouldRestrictToWhitelistedSourceIds() const {
394 return base::GetFieldTrialParamByFeatureAsBool(
395 kUkmFeature, "RestrictToWhitelistedSourceIds", false);
398 bool UkmRecorderImpl::ShouldRestrictToWhitelistedEntries() const {
402 void UkmRecorderImpl::UpdateSourceURL(SourceId source_id,
403 const GURL& unsanitized_url) {
404 DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
406 if (base::ContainsKey(recordings_.sources, source_id))
409 const GURL sanitized_url = SanitizeURL(unsanitized_url);
410 if (!ShouldRecordUrl(source_id, sanitized_url))
413 RecordSource(std::make_unique<UkmSource>(source_id, sanitized_url));
416 void UkmRecorderImpl::UpdateAppURL(SourceId source_id, const GURL& url) {
417 if (!extensions_enabled_) {
418 RecordDroppedSource(DroppedDataReason::EXTENSION_URLS_DISABLED);
421 UpdateSourceURL(source_id, url);
424 void UkmRecorderImpl::RecordNavigation(
426 const UkmSource::NavigationData& unsanitized_navigation_data) {
427 DCHECK(GetSourceIdType(source_id) == SourceIdType::NAVIGATION_ID);
428 DCHECK(!base::ContainsKey(recordings_.sources, source_id));
429 // TODO(csharrison): Consider changing this behavior so the Source isn't event
430 // recorded at all if the final URL in |unsanitized_navigation_data| should
432 std::vector<GURL> urls;
433 for (const GURL& url : unsanitized_navigation_data.urls) {
434 const GURL sanitized_url = SanitizeURL(url);
435 if (ShouldRecordUrl(source_id, sanitized_url))
436 urls.push_back(std::move(sanitized_url));
439 // None of the URLs passed the ShouldRecordUrl check, so do not create a new
444 UkmSource::NavigationData sanitized_navigation_data =
445 unsanitized_navigation_data.CopyWithSanitizedUrls(urls);
447 std::make_unique<UkmSource>(source_id, sanitized_navigation_data));
450 bool UkmRecorderImpl::ShouldRecordUrl(SourceId source_id,
451 const GURL& sanitized_url) const {
452 if (!recording_enabled_) {
453 RecordDroppedSource(DroppedDataReason::RECORDING_DISABLED);
457 if (recordings_.sources.size() >= GetMaxSources()) {
458 RecordDroppedSource(DroppedDataReason::MAX_HIT);
462 if (ShouldRestrictToWhitelistedSourceIds() &&
463 !IsWhitelistedSourceId(source_id)) {
464 RecordDroppedSource(DroppedDataReason::NOT_WHITELISTED);
468 if (sanitized_url.is_empty()) {
469 RecordDroppedSource(DroppedDataReason::EMPTY_URL);
473 if (!HasSupportedScheme(sanitized_url)) {
474 RecordDroppedSource(DroppedDataReason::UNSUPPORTED_URL_SCHEME);
475 DVLOG(2) << "Dropped Unsupported UKM URL:" << source_id << ":"
476 << sanitized_url.spec();
480 // Extension URLs need to be specifically enabled and the extension synced.
481 if (sanitized_url.SchemeIs(kExtensionScheme)) {
482 DCHECK_EQ(sanitized_url.GetWithEmptyPath(), sanitized_url);
483 if (!extensions_enabled_) {
484 RecordDroppedSource(DroppedDataReason::EXTENSION_URLS_DISABLED);
487 if (!is_webstore_extension_callback_ ||
488 !is_webstore_extension_callback_.Run(sanitized_url.host_piece())) {
489 RecordDroppedSource(DroppedDataReason::EXTENSION_NOT_SYNCED);
496 void UkmRecorderImpl::RecordSource(std::unique_ptr<UkmSource> source) {
497 SourceId source_id = source->id();
498 if (GetSourceIdType(source_id) == SourceIdType::NAVIGATION_ID)
499 recordings_.source_counts.navigation_sources++;
500 recordings_.source_counts.observed++;
501 recordings_.sources.emplace(source_id, std::move(source));
504 void UkmRecorderImpl::AddEntry(mojom::UkmEntryPtr entry) {
505 DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
507 DCHECK(!HasUnknownMetrics(decode_map_, *entry));
509 if (!recording_enabled_) {
510 RecordDroppedEntry(DroppedDataReason::RECORDING_DISABLED);
514 EventAggregate& event_aggregate =
515 recordings_.event_aggregations[entry->event_hash];
516 event_aggregate.total_count++;
517 for (const auto& metric : entry->metrics) {
518 MetricAggregate& aggregate = event_aggregate.metrics[metric.first];
519 double value = metric.second;
520 aggregate.total_count++;
521 aggregate.value_sum += value;
522 aggregate.value_square_sum += value * value;
525 if (ShouldRestrictToWhitelistedEntries() &&
526 !base::ContainsKey(whitelisted_entry_hashes_, entry->event_hash)) {
527 RecordDroppedEntry(DroppedDataReason::NOT_WHITELISTED);
528 event_aggregate.dropped_due_to_whitelist++;
529 for (auto& metric : entry->metrics)
530 event_aggregate.metrics[metric.first].dropped_due_to_whitelist++;
534 if (default_sampling_rate_ == 0)
535 LoadExperimentSamplingInfo();
537 auto found = event_sampling_rates_.find(entry->event_hash);
538 int sampling_rate = (found != event_sampling_rates_.end())
540 : default_sampling_rate_;
541 if (sampling_enabled_ &&
542 (sampling_rate == 0 ||
543 (sampling_rate > 1 && base::RandInt(1, sampling_rate) != 1))) {
544 RecordDroppedEntry(DroppedDataReason::SAMPLED_OUT);
545 event_aggregate.dropped_due_to_sampling++;
546 for (auto& metric : entry->metrics)
547 event_aggregate.metrics[metric.first].dropped_due_to_sampling++;
551 if (recordings_.entries.size() >= GetMaxEntries()) {
552 RecordDroppedEntry(DroppedDataReason::MAX_HIT);
553 event_aggregate.dropped_due_to_limits++;
554 for (auto& metric : entry->metrics)
555 event_aggregate.metrics[metric.first].dropped_due_to_limits++;
559 recordings_.entries.push_back(std::move(entry));
562 void UkmRecorderImpl::LoadExperimentSamplingInfo() {
563 DCHECK_EQ(0, default_sampling_rate_);
564 std::map<std::string, std::string> params;
566 if (base::FeatureList::IsEnabled(kUkmSamplingRateFeature)) {
567 // Enabled may have various parameters to control sampling.
568 if (base::GetFieldTrialParamsByFeature(kUkmSamplingRateFeature, ¶ms)) {
569 for (const auto& kv : params) {
570 const std::string& key = kv.first;
571 if (key.length() == 0)
574 // Keys starting with an underscore are global configuration.
575 if (key.at(0) == '_') {
576 if (key == "_default_sampling") {
578 if (base::StringToInt(kv.second, &sampling) && sampling >= 0)
579 default_sampling_rate_ = sampling;
584 // Anything else is an event name.
586 if (base::StringToInt(kv.second, &sampling) && sampling >= 0)
587 event_sampling_rates_[base::HashMetricName(key)] = sampling;
592 // Default rate must be >0 to indicate that load is complete.
593 if (default_sampling_rate_ == 0)
594 default_sampling_rate_ = 1;
597 void UkmRecorderImpl::StoreWhitelistedEntries() {
598 DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
600 base::SplitString(GetWhitelistEntries(), ",", base::TRIM_WHITESPACE,
601 base::SPLIT_WANT_NONEMPTY);
602 for (const auto& entry_string : entries)
603 whitelisted_entry_hashes_.insert(base::HashMetricName(entry_string));
604 decode_map_ = ::ukm::builders::CreateDecodeMap();