src/tools/telemetry/telemetry/util/statistics.py

   1 # Copyright 2014 The Chromium Authors. All rights reserved.
   2 # Use of this source code is governed by a BSD-style license that can be
   3 # found in the LICENSE file.
   4
   5 """A collection of statistical utility functions to be used by metrics."""
   6
   7 import math
   8
   9
  10 def Clamp(value, low=0.0, high=1.0):
  11   """Clamp a value between some low and high value."""
  12   return min(max(value, low), high)
  13
  14
  15 def NormalizeSamples(samples):
  16   """Sorts the samples, and map them linearly to the range [0,1].
  17
  18   They're mapped such that for the N samples, the first sample is 0.5/N and the
  19   last sample is (N-0.5)/N.
  20
  21   Background: The discrepancy of the sample set i/(N-1); i=0, ..., N-1 is 2/N,
  22   twice the discrepancy of the sample set (i+1/2)/N; i=0, ..., N-1. In our case
  23   we don't want to distinguish between these two cases, as our original domain
  24   is not bounded (it is for Monte Carlo integration, where discrepancy was
  25   first used).
  26   """
  27   if not samples:
  28     return samples, 1.0
  29   samples = sorted(samples)
  30   low = min(samples)
  31   high = max(samples)
  32   new_low = 0.5 / len(samples)
  33   new_high = (len(samples)-0.5) / len(samples)
  34   if high-low == 0.0:
  35     return [0.5] * len(samples), 1.0
  36   scale = (new_high - new_low) / (high - low)
  37   for i in xrange(0, len(samples)):
  38     samples[i] = float(samples[i] - low) * scale + new_low
  39   return samples, scale
  40
  41
  42 def Discrepancy(samples, location_count=None):
  43   """Computes the discrepancy of a set of 1D samples from the interval [0,1].
  44
  45   The samples must be sorted. We define the discrepancy of an empty set
  46   of samples to be zero.
  47
  48   http://en.wikipedia.org/wiki/Low-discrepancy_sequence
  49   http://mathworld.wolfram.com/Discrepancy.html
  50   """
  51   if not samples:
  52     return 0.0
  53
  54   max_local_discrepancy = 0
  55   inv_sample_count = 1.0 / len(samples)
  56   locations = []
  57   # For each location, stores the number of samples less than that location.
  58   count_less = []
  59   # For each location, stores the number of samples less than or equal to that
  60   # location.
  61   count_less_equal = []
  62
  63   if location_count:
  64     # Generate list of equally spaced locations.
  65     sample_index = 0
  66     for i in xrange(0, int(location_count)):
  67       location = float(i) / (location_count-1)
  68       locations.append(location)
  69       while sample_index < len(samples) and samples[sample_index] < location:
  70         sample_index += 1
  71       count_less.append(sample_index)
  72       while  sample_index < len(samples) and samples[sample_index] <= location:
  73         sample_index += 1
  74       count_less_equal.append(sample_index)
  75   else:
  76     # Populate locations with sample positions. Append 0 and 1 if necessary.
  77     if samples[0] > 0.0:
  78       locations.append(0.0)
  79       count_less.append(0)
  80       count_less_equal.append(0)
  81     for i in xrange(0, len(samples)):
  82       locations.append(samples[i])
  83       count_less.append(i)
  84       count_less_equal.append(i+1)
  85     if samples[-1] < 1.0:
  86       locations.append(1.0)
  87       count_less.append(len(samples))
  88       count_less_equal.append(len(samples))
  89
  90   # Iterate over the intervals defined by any pair of locations.
  91   for i in xrange(0, len(locations)):
  92     for j in xrange(i+1, len(locations)):
  93       # Length of interval
  94       length = locations[j] - locations[i]
  95
  96       # Local discrepancy for closed interval
  97       count_closed = count_less_equal[j] - count_less[i]
  98       local_discrepancy_closed = abs(float(count_closed) *
  99                                      inv_sample_count - length)
 100       max_local_discrepancy = max(local_discrepancy_closed,
 101                                   max_local_discrepancy)
 102
 103       # Local discrepancy for open interval
 104       count_open = count_less[j] - count_less_equal[i]
 105       local_discrepancy_open = abs(float(count_open) *
 106                                    inv_sample_count - length)
 107       max_local_discrepancy = max(local_discrepancy_open,
 108                                   max_local_discrepancy)
 109
 110   return max_local_discrepancy
 111
 112
 113 def TimestampsDiscrepancy(timestamps, absolute=True,
 114                           location_count=None):
 115   """A discrepancy based metric for measuring timestamp jank.
 116
 117   TimestampsDiscrepancy quantifies the largest area of jank observed in a series
 118   of timestamps.  Note that this is different from metrics based on the
 119   max_time_interval. For example, the time stamp series A = [0,1,2,3,5,6] and
 120   B = [0,1,2,3,5,7] have the same max_time_interval = 2, but
 121   Discrepancy(B) > Discrepancy(A).
 122
 123   Two variants of discrepancy can be computed:
 124
 125   Relative discrepancy is following the original definition of
 126   discrepancy. It characterized the largest area of jank, relative to the
 127   duration of the entire time stamp series.  We normalize the raw results,
 128   because the best case discrepancy for a set of N samples is 1/N (for
 129   equally spaced samples), and we want our metric to report 0.0 in that
 130   case.
 131
 132   Absolute discrepancy also characterizes the largest area of jank, but its
 133   value wouldn't change (except for imprecisions due to a low
 134   |interval_multiplier|) if additional 'good' intervals were added to an
 135   exisiting list of time stamps.  Its range is [0,inf] and the unit is
 136   milliseconds.
 137
 138   The time stamp series C = [0,2,3,4] and D = [0,2,3,4,5] have the same
 139   absolute discrepancy, but D has lower relative discrepancy than C.
 140
 141   |timestamps| may be a list of lists S = [S_1, S_2, ..., S_N], where each
 142   S_i is a time stamp series. In that case, the discrepancy D(S) is:
 143   D(S) = max(D(S_1), D(S_2), ..., D(S_N))
 144   """
 145   if not timestamps:
 146     return 0.0
 147
 148   if isinstance(timestamps[0], list):
 149     range_discrepancies = [TimestampsDiscrepancy(r) for r in timestamps]
 150     return max(range_discrepancies)
 151
 152   samples, sample_scale = NormalizeSamples(timestamps)
 153   discrepancy = Discrepancy(samples, location_count)
 154   inv_sample_count = 1.0 / len(samples)
 155   if absolute:
 156     # Compute absolute discrepancy
 157     discrepancy /= sample_scale
 158   else:
 159     # Compute relative discrepancy
 160     discrepancy = Clamp((discrepancy-inv_sample_count) / (1.0-inv_sample_count))
 161   return discrepancy
 162
 163
 164 def DurationsDiscrepancy(durations, absolute=True,
 165                          location_count=None):
 166   """A discrepancy based metric for measuring duration jank.
 167
 168   DurationsDiscrepancy computes a jank metric which measures how irregular a
 169   given sequence of intervals is. In order to minimize jank, each duration
 170   should be equally long. This is similar to how timestamp jank works,
 171   and we therefore reuse the timestamp discrepancy function above to compute a
 172   similar duration discrepancy number.
 173
 174   Because timestamp discrepancy is defined in terms of timestamps, we first
 175   convert the list of durations to monotonically increasing timestamps.
 176
 177   Args:
 178     durations: List of interval lengths in milliseconds.
 179     absolute: See TimestampsDiscrepancy.
 180     interval_multiplier: See TimestampsDiscrepancy.
 181   """
 182   if not durations:
 183     return 0.0
 184
 185   timestamps = reduce(lambda x, y: x + [x[-1] + y], durations, [0])
 186   return TimestampsDiscrepancy(timestamps, absolute, location_count)
 187
 188
 189 def ArithmeticMean(data):
 190   """Calculates arithmetic mean.
 191
 192   Args:
 193     data: A list of samples.
 194
 195   Returns:
 196     The arithmetic mean value, or 0 if the list is empty.
 197   """
 198   numerator_total = Total(data)
 199   denominator_total = Total(len(data))
 200   return DivideIfPossibleOrZero(numerator_total, denominator_total)
 201
 202
 203 def StandardDeviation(data):
 204   """Calculates the standard deviation.
 205
 206   Args:
 207     data: A list of samples.
 208
 209   Returns:
 210     The standard deviation of the samples provided.
 211   """
 212   if len(data) == 1:
 213     return 0.0
 214
 215   mean = ArithmeticMean(data)
 216   variances = [float(x) - mean for x in data]
 217   variances = [x * x for x in variances]
 218   std_dev = math.sqrt(ArithmeticMean(variances))
 219
 220   return std_dev
 221
 222
 223 def TrapezoidalRule(data, dx):
 224   """ Calculate the integral according to the trapezoidal rule
 225
 226   TrapezoidalRule approximates the definite integral of f from a to b by
 227   the composite trapezoidal rule, using n subintervals.
 228   http://en.wikipedia.org/wiki/Trapezoidal_rule#Uniform_grid
 229
 230   Args:
 231     data: A list of samples
 232     dx: The uniform distance along the x axis between any two samples
 233
 234   Returns:
 235     The area under the curve defined by the samples and the uniform distance
 236     according to the trapezoidal rule.
 237   """
 238
 239   n = len(data) - 1
 240   s = data[0] + data[n]
 241
 242   if n == 0:
 243     return 0.0
 244
 245   for i in range(1, n):
 246     s += 2 * data[i]
 247
 248   return s * dx / 2.0
 249
 250 def Total(data):
 251   """Returns the float value of a number or the sum of a list."""
 252   if type(data) == float:
 253     total = data
 254   elif type(data) == int:
 255     total = float(data)
 256   elif type(data) == list:
 257     total = float(sum(data))
 258   else:
 259     raise TypeError
 260   return total
 261
 262
 263 def DivideIfPossibleOrZero(numerator, denominator):
 264   """Returns the quotient, or zero if the denominator is zero."""
 265   if not denominator:
 266     return 0.0
 267   else:
 268     return numerator / denominator
 269
 270
 271 def GeneralizedMean(values, exponent):
 272   """See http://en.wikipedia.org/wiki/Generalized_mean"""
 273   if not values:
 274     return 0.0
 275   sum_of_powers = 0.0
 276   for v in values:
 277     sum_of_powers += v ** exponent
 278   return (sum_of_powers / len(values)) ** (1.0/exponent)
 279
 280
 281 def Median(values):
 282   """Gets the median of a list of values."""
 283   return Percentile(values, 50)
 284
 285
 286 def Percentile(values, percentile):
 287   """Calculates the value below which a given percentage of values fall.
 288
 289   For example, if 17% of the values are less than 5.0, then 5.0 is the 17th
 290   percentile for this set of values. When the percentage doesn't exactly
 291   match a rank in the list of values, the percentile is computed using linear
 292   interpolation between closest ranks.
 293
 294   Args:
 295     values: A list of numerical values.
 296     percentile: A number between 0 and 100.
 297
 298   Returns:
 299     The Nth percentile for the list of values, where N is the given percentage.
 300   """
 301   if not values:
 302     return 0.0
 303   sorted_values = sorted(values)
 304   n = len(values)
 305   percentile /= 100.0
 306   if percentile <= 0.5 / n:
 307     return sorted_values[0]
 308   elif percentile >= (n - 0.5) / n:
 309     return sorted_values[-1]
 310   else:
 311     floor_index = int(math.floor(n * percentile -  0.5))
 312     floor_value = sorted_values[floor_index]
 313     ceil_value = sorted_values[floor_index+1]
 314     alpha = n * percentile - 0.5 - floor_index
 315     return floor_value + alpha * (ceil_value - floor_value)
 316
 317
 318 def GeometricMean(values):
 319   """Compute a rounded geometric mean from an array of values."""
 320   if not values:
 321     return None
 322   # To avoid infinite value errors, make sure no value is less than 0.001.
 323   new_values = []
 324   for value in values:
 325     if value > 0.001:
 326       new_values.append(value)
 327     else:
 328       new_values.append(0.001)
 329   # Compute the sum of the log of the values.
 330   log_sum = sum(map(math.log, new_values))
 331   # Raise e to that sum over the number of values.
 332   mean = math.pow(math.e, (log_sum / len(new_values)))
 333   # Return the rounded mean.
 334   return int(round(mean))