1 # Copyright 2014 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file.
5 """A collection of statistical utility functions to be used by metrics."""
def Clamp(value, low=0.0, high=1.0):
  """Restricts a value to the inclusive interval [low, high]."""
  bounded_below = max(value, low)
  return min(bounded_below, high)
def NormalizeSamples(samples):
  """Sorts the samples, and maps them linearly to the range [0,1].

  They're mapped such that for the N samples, the first sample is 0.5/N and the
  last sample is (N-0.5)/N.

  Background: The discrepancy of the sample set i/(N-1); i=0, ..., N-1 is 2/N,
  twice the discrepancy of the sample set (i+1/2)/N; i=0, ..., N-1. In our case
  we don't want to distinguish between these two cases, as our original domain
  is not bounded (it is for Monte Carlo integration, where discrepancy was
  first used).

  Args:
    samples: A list of numerical samples.

  Returns:
    A tuple (normalized_samples, scale), where |scale| is the factor by which
    sample spacing was multiplied during normalization.
  """
  if not samples:
    return samples, 1.0
  samples = sorted(samples)
  low = min(samples)
  high = max(samples)
  new_low = 0.5 / len(samples)
  new_high = (len(samples)-0.5) / len(samples)
  if high-low == 0.0:
    # All samples are identical; map every one to the midpoint.
    return [0.5] * len(samples), 1.0
  scale = (new_high - new_low) / (high - low)
  for i in range(0, len(samples)):
    samples[i] = float(samples[i] - low) * scale + new_low
  return samples, scale
def Discrepancy(samples, location_count=None):
  """Computes the discrepancy of a set of 1D samples from the interval [0,1].

  The samples must be sorted. We define the discrepancy of an empty set
  of samples to be zero.

  http://en.wikipedia.org/wiki/Low-discrepancy_sequence
  http://mathworld.wolfram.com/Discrepancy.html

  Args:
    samples: A sorted list of samples in [0,1].
    location_count: If given, the discrepancy is approximated by only
        considering this many equally spaced interval boundaries instead of
        the exact sample positions.

  Returns:
    The maximum over all sub-intervals of |fraction of samples inside -
    interval length|, considering both open and closed intervals.
  """
  if not samples:
    return 0.0

  max_local_discrepancy = 0
  inv_sample_count = 1.0 / len(samples)
  locations = []
  # For each location, stores the number of samples less than that location.
  count_less = []
  # For each location, stores the number of samples less than or equal to that
  # location.
  count_less_equal = []

  if location_count:
    # Generate list of equally spaced locations.
    sample_index = 0
    for i in range(0, int(location_count)):
      location = float(i) / (location_count-1)
      locations.append(location)
      while sample_index < len(samples) and samples[sample_index] < location:
        sample_index += 1
      count_less.append(sample_index)
      while sample_index < len(samples) and samples[sample_index] <= location:
        sample_index += 1
      count_less_equal.append(sample_index)
  else:
    # Populate locations with sample positions. Append 0 and 1 if necessary.
    if samples[0] > 0.0:
      locations.append(0.0)
      count_less.append(0)
      count_less_equal.append(0)
    for i in range(0, len(samples)):
      locations.append(samples[i])
      count_less.append(i)
      count_less_equal.append(i+1)
    if samples[-1] < 1.0:
      locations.append(1.0)
      count_less.append(len(samples))
      count_less_equal.append(len(samples))

  # Iterate over the intervals defined by any pair of locations.
  for i in range(0, len(locations)):
    for j in range(i+1, len(locations)):
      # Length of interval
      length = locations[j] - locations[i]

      # Local discrepancy for closed interval
      count_closed = count_less_equal[j] - count_less[i]
      local_discrepancy_closed = abs(float(count_closed) *
                                     inv_sample_count - length)
      max_local_discrepancy = max(local_discrepancy_closed,
                                  max_local_discrepancy)

      # Local discrepancy for open interval
      count_open = count_less[j] - count_less_equal[i]
      local_discrepancy_open = abs(float(count_open) *
                                   inv_sample_count - length)
      max_local_discrepancy = max(local_discrepancy_open,
                                  max_local_discrepancy)

  return max_local_discrepancy
def TimestampsDiscrepancy(timestamps, absolute=True,
                          location_count=None):
  """A discrepancy based metric for measuring timestamp jank.

  TimestampsDiscrepancy quantifies the largest area of jank observed in a series
  of timestamps. Note that this is different from metrics based on the
  max_time_interval. For example, the time stamp series A = [0,1,2,3,5,6] and
  B = [0,1,2,3,5,7] have the same max_time_interval = 2, but
  Discrepancy(B) > Discrepancy(A).

  Two variants of discrepancy can be computed:

  Relative discrepancy is following the original definition of
  discrepancy. It characterized the largest area of jank, relative to the
  duration of the entire time stamp series. We normalize the raw results,
  because the best case discrepancy for a set of N samples is 1/N (for
  equally spaced samples), and we want our metric to report 0.0 in that
  case.

  Absolute discrepancy also characterizes the largest area of jank, but its
  value wouldn't change (except for imprecisions due to a low
  |location_count|) if additional 'good' intervals were added to an
  existing list of time stamps. Its range is [0,inf] and the unit is
  milliseconds.

  The time stamp series C = [0,2,3,4] and D = [0,2,3,4,5] have the same
  absolute discrepancy, but D has lower relative discrepancy than C.

  |timestamps| may be a list of lists S = [S_1, S_2, ..., S_N], where each
  S_i is a time stamp series. In that case, the discrepancy D(S) is:
  D(S) = max(D(S_1), D(S_2), ..., D(S_N))
  """
  if not timestamps:
    return 0.0

  if isinstance(timestamps[0], list):
    range_discrepancies = [TimestampsDiscrepancy(r) for r in timestamps]
    return max(range_discrepancies)

  samples, sample_scale = NormalizeSamples(timestamps)
  discrepancy = Discrepancy(samples, location_count)
  inv_sample_count = 1.0 / len(samples)
  if absolute:
    # Compute absolute discrepancy
    discrepancy /= sample_scale
  else:
    # Compute relative discrepancy
    discrepancy = Clamp((discrepancy-inv_sample_count) / (1.0-inv_sample_count))
  return discrepancy
def DurationsDiscrepancy(durations, absolute=True,
                         location_count=None):
  """A discrepancy based metric for measuring duration jank.

  DurationsDiscrepancy computes a jank metric which measures how irregular a
  given sequence of intervals is. In order to minimize jank, each duration
  should be equally long. This is similar to how timestamp jank works,
  and we therefore reuse the timestamp discrepancy function above to compute a
  similar duration discrepancy number.

  Because timestamp discrepancy is defined in terms of timestamps, we first
  convert the list of durations to monotonically increasing timestamps.

  Args:
    durations: List of interval lengths in milliseconds.
    absolute: See TimestampsDiscrepancy.
    location_count: See TimestampsDiscrepancy.
  """
  if not durations:
    return 0.0

  # Cumulative sum starting at 0 turns durations into timestamps.
  timestamps = [0]
  for duration in durations:
    timestamps.append(timestamps[-1] + duration)
  return TimestampsDiscrepancy(timestamps, absolute, location_count)
def ArithmeticMean(data):
  """Calculates arithmetic mean.

  Args:
    data: A list of samples.

  Returns:
    The arithmetic mean value, or 0 if the list is empty.
  """
  sample_sum = Total(data)
  sample_count = Total(len(data))
  return DivideIfPossibleOrZero(sample_sum, sample_count)
def StandardDeviation(data):
  """Calculates the standard deviation.

  Args:
    data: A list of samples.

  Returns:
    The standard deviation of the samples provided, or 0.0 for an empty
    list.
  """
  if len(data) == 0:
    return 0.0

  mean = ArithmeticMean(data)
  # Population variance: mean of squared deviations from the mean.
  variances = [float(x) - mean for x in data]
  variances = [x * x for x in variances]
  std_dev = math.sqrt(ArithmeticMean(variances))

  return std_dev
def TrapezoidalRule(data, dx):
  """ Calculate the integral according to the trapezoidal rule

  TrapezoidalRule approximates the definite integral of f from a to b by
  the composite trapezoidal rule, using n subintervals.
  http://en.wikipedia.org/wiki/Trapezoidal_rule#Uniform_grid

  Args:
    data: A list of samples
    dx: The uniform distance along the x axis between any two samples

  Returns:
    The area under the curve defined by the samples and the uniform distance
    according to the trapezoidal rule, or 0.0 for fewer than two samples.
  """
  n = len(data) - 1
  s = data[0] + data[n]

  # A single sample spans no interval, so the integral is zero.
  if n == 0:
    return 0.0

  # Interior samples are counted twice by the composite rule.
  for i in range(1, n):
    s += 2 * data[i]

  return s * dx / 2.0
def Total(data):
  """Returns the float value of a number or the sum of a list.

  Raises:
    TypeError: If |data| is not a float, int, or list.
  """
  if type(data) == float:
    total = data
  elif type(data) == int:
    total = float(data)
  elif type(data) == list:
    total = float(sum(data))
  else:
    raise TypeError
  return total
def DivideIfPossibleOrZero(numerator, denominator):
  """Returns the quotient, or zero if the denominator is zero."""
  if not denominator:
    return 0.0
  return numerator / denominator
def GeneralizedMean(values, exponent):
  """See http://en.wikipedia.org/wiki/Generalized_mean

  Args:
    values: A list of numerical samples.
    exponent: The mean's exponent p (p=1 arithmetic, p=2 quadratic, ...).

  Returns:
    The generalized (power) mean, or 0.0 for an empty list.
  """
  if not values:
    return 0.0
  sum_of_powers = 0
  for v in values:
    sum_of_powers += v ** exponent
  return (sum_of_powers / len(values)) ** (1.0/exponent)
def Median(values):
  """Gets the median of a list of values."""
  return Percentile(values, 50)
def Percentile(values, percentile):
  """Calculates the value below which a given percentage of values fall.

  For example, if 17% of the values are less than 5.0, then 5.0 is the 17th
  percentile for this set of values. When the percentage doesn't exactly
  match a rank in the list of values, the percentile is computed using linear
  interpolation between closest ranks.

  Args:
    values: A list of numerical values.
    percentile: A number between 0 and 100.

  Returns:
    The Nth percentile for the list of values, where N is the given
    percentage, or 0.0 for an empty list.
  """
  if not values:
    return 0.0

  sorted_values = sorted(values)
  n = len(values)
  percentile /= 100.0
  # Ranks follow the (i+0.5)/n convention, so percentiles below the first
  # rank or above the last clamp to the extreme values.
  if percentile <= 0.5 / n:
    return sorted_values[0]
  elif percentile >= (n - 0.5) / n:
    return sorted_values[-1]
  else:
    floor_index = int(math.floor(n * percentile - 0.5))
    floor_value = sorted_values[floor_index]
    ceil_value = sorted_values[floor_index+1]
    # Linear interpolation between the two closest ranks.
    alpha = n * percentile - 0.5 - floor_index
    return floor_value + alpha * (ceil_value - floor_value)
def GeometricMean(values):
  """Compute a rounded geometric mean from an array of values.

  Args:
    values: A list of numerical values.

  Returns:
    The geometric mean rounded to the nearest int, or None for an empty list.
  """
  if not values:
    return None
  # To avoid infinite value errors, make sure no value is less than 0.001.
  new_values = []
  for value in values:
    if value > 0.001:
      new_values.append(value)
    else:
      new_values.append(0.001)
  # Compute the sum of the log of the values.
  log_sum = sum(map(math.log, new_values))
  # Raise e to that sum over the number of values.
  mean = math.pow(math.e, (log_sum / len(new_values)))
  # Return the rounded mean.
  return int(round(mean))