boost/math/tools/univariate_statistics.hpp

   1 //  (C) Copyright Nick Thompson 2018.
   2 //  Use, modification and distribution are subject to the
   3 //  Boost Software License, Version 1.0. (See accompanying file
   4 //  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
   5
   6 #ifndef BOOST_MATH_TOOLS_UNIVARIATE_STATISTICS_HPP
   7 #define BOOST_MATH_TOOLS_UNIVARIATE_STATISTICS_HPP
   8
   9 #include <algorithm>
  10 #include <iterator>
  11 #include <tuple>
  12 #include <boost/assert.hpp>
  13 #include <boost/config/header_deprecated.hpp>
  14
  15 BOOST_HEADER_DEPRECATED("<boost/math/statistics/univariate_statistics.hpp>");
  16
  17 namespace boost::math::tools {
  18
  19 template<class ForwardIterator>
  20 auto mean(ForwardIterator first, ForwardIterator last)
  21 {
  22     using Real = typename std::iterator_traits<ForwardIterator>::value_type;
  23     BOOST_ASSERT_MSG(first != last, "At least one sample is required to compute the mean.");
  24     if constexpr (std::is_integral<Real>::value)
  25     {
  26         double mu = 0;
  27         double i = 1;
  28         for(auto it = first; it != last; ++it) {
  29             mu = mu + (*it - mu)/i;
  30             i += 1;
  31         }
  32         return mu;
  33     }
  34     else if constexpr (std::is_same_v<typename std::iterator_traits<ForwardIterator>::iterator_category, std::random_access_iterator_tag>)
  35     {
  36         size_t elements = std::distance(first, last);
  37         Real mu0 = 0;
  38         Real mu1 = 0;
  39         Real mu2 = 0;
  40         Real mu3 = 0;
  41         Real i = 1;
  42         auto end = last - (elements % 4);
  43         for(auto it = first; it != end;  it += 4) {
  44             Real inv = Real(1)/i;
  45             Real tmp0 = (*it - mu0);
  46             Real tmp1 = (*(it+1) - mu1);
  47             Real tmp2 = (*(it+2) - mu2);
  48             Real tmp3 = (*(it+3) - mu3);
  49             // please generate a vectorized fma here
  50             mu0 += tmp0*inv;
  51             mu1 += tmp1*inv;
  52             mu2 += tmp2*inv;
  53             mu3 += tmp3*inv;
  54             i += 1;
  55         }
  56         Real num1 = Real(elements  - (elements %4))/Real(4);
  57         Real num2 = num1 + Real(elements % 4);
  58
  59         for (auto it = end; it != last; ++it)
  60         {
  61             mu3 += (*it-mu3)/i;
  62             i += 1;
  63         }
  64
  65         return (num1*(mu0+mu1+mu2) + num2*mu3)/Real(elements);
  66     }
  67     else
  68     {
  69         auto it = first;
  70         Real mu = *it;
  71         Real i = 2;
  72         while(++it != last)
  73         {
  74             mu += (*it - mu)/i;
  75             i += 1;
  76         }
  77         return mu;
  78     }
  79 }
  80
  81 template<class Container>
  82 inline auto mean(Container const & v)
  83 {
  84     return mean(v.cbegin(), v.cend());
  85 }
  86
  87 template<class ForwardIterator>
  88 auto variance(ForwardIterator first, ForwardIterator last)
  89 {
  90     using Real = typename std::iterator_traits<ForwardIterator>::value_type;
  91     BOOST_ASSERT_MSG(first != last, "At least one sample is required to compute mean and variance.");
  92     // Higham, Accuracy and Stability, equation 1.6a and 1.6b:
  93     if constexpr (std::is_integral<Real>::value)
  94     {
  95         double M = *first;
  96         double Q = 0;
  97         double k = 2;
  98         for (auto it = std::next(first); it != last; ++it)
  99         {
 100             double tmp = *it - M;
 101             Q = Q + ((k-1)*tmp*tmp)/k;
 102             M = M + tmp/k;
 103             k += 1;
 104         }
 105         return Q/(k-1);
 106     }
 107     else
 108     {
 109         Real M = *first;
 110         Real Q = 0;
 111         Real k = 2;
 112         for (auto it = std::next(first); it != last; ++it)
 113         {
 114             Real tmp = (*it - M)/k;
 115             Q += k*(k-1)*tmp*tmp;
 116             M += tmp;
 117             k += 1;
 118         }
 119         return Q/(k-1);
 120     }
 121 }
 122
 123 template<class Container>
 124 inline auto variance(Container const & v)
 125 {
 126     return variance(v.cbegin(), v.cend());
 127 }
 128
 129 template<class ForwardIterator>
 130 auto sample_variance(ForwardIterator first, ForwardIterator last)
 131 {
 132     size_t n = std::distance(first, last);
 133     BOOST_ASSERT_MSG(n > 1, "At least two samples are required to compute the sample variance.");
 134     return n*variance(first, last)/(n-1);
 135 }
 136
 137 template<class Container>
 138 inline auto sample_variance(Container const & v)
 139 {
 140     return sample_variance(v.cbegin(), v.cend());
 141 }
 142
 143
 144 // Follows equation 1.5 of:
 145 // https://prod.sandia.gov/techlib-noauth/access-control.cgi/2008/086212.pdf
 146 template<class ForwardIterator>
 147 auto skewness(ForwardIterator first, ForwardIterator last)
 148 {
 149     using Real = typename std::iterator_traits<ForwardIterator>::value_type;
 150     BOOST_ASSERT_MSG(first != last, "At least one sample is required to compute skewness.");
 151     if constexpr (std::is_integral<Real>::value)
 152     {
 153         double M1 = *first;
 154         double M2 = 0;
 155         double M3 = 0;
 156         double n = 2;
 157         for (auto it = std::next(first); it != last; ++it)
 158         {
 159             double delta21 = *it - M1;
 160             double tmp = delta21/n;
 161             M3 = M3 + tmp*((n-1)*(n-2)*delta21*tmp - 3*M2);
 162             M2 = M2 + tmp*(n-1)*delta21;
 163             M1 = M1 + tmp;
 164             n += 1;
 165         }
 166
 167         double var = M2/(n-1);
 168         if (var == 0)
 169         {
 170             // The limit is technically undefined, but the interpretation here is clear:
 171             // A constant dataset has no skewness.
 172             return double(0);
 173         }
 174         double skew = M3/(M2*sqrt(var));
 175         return skew;
 176     }
 177     else
 178     {
 179         Real M1 = *first;
 180         Real M2 = 0;
 181         Real M3 = 0;
 182         Real n = 2;
 183         for (auto it = std::next(first); it != last; ++it)
 184         {
 185             Real delta21 = *it - M1;
 186             Real tmp = delta21/n;
 187             M3 += tmp*((n-1)*(n-2)*delta21*tmp - 3*M2);
 188             M2 += tmp*(n-1)*delta21;
 189             M1 += tmp;
 190             n += 1;
 191         }
 192
 193         Real var = M2/(n-1);
 194         if (var == 0)
 195         {
 196             // The limit is technically undefined, but the interpretation here is clear:
 197             // A constant dataset has no skewness.
 198             return Real(0);
 199         }
 200         Real skew = M3/(M2*sqrt(var));
 201         return skew;
 202     }
 203 }
 204
 205 template<class Container>
 206 inline auto skewness(Container const & v)
 207 {
 208     return skewness(v.cbegin(), v.cend());
 209 }
 210
 211 // Follows equation 1.5/1.6 of:
 212 // https://prod.sandia.gov/techlib-noauth/access-control.cgi/2008/086212.pdf
 213 template<class ForwardIterator>
 214 auto first_four_moments(ForwardIterator first, ForwardIterator last)
 215 {
 216     using Real = typename std::iterator_traits<ForwardIterator>::value_type;
 217     BOOST_ASSERT_MSG(first != last, "At least one sample is required to compute the first four moments.");
 218     if constexpr (std::is_integral<Real>::value)
 219     {
 220         double M1 = *first;
 221         double M2 = 0;
 222         double M3 = 0;
 223         double M4 = 0;
 224         double n = 2;
 225         for (auto it = std::next(first); it != last; ++it)
 226         {
 227             double delta21 = *it - M1;
 228             double tmp = delta21/n;
 229             M4 = M4 + tmp*(tmp*tmp*delta21*((n-1)*(n*n-3*n+3)) + 6*tmp*M2 - 4*M3);
 230             M3 = M3 + tmp*((n-1)*(n-2)*delta21*tmp - 3*M2);
 231             M2 = M2 + tmp*(n-1)*delta21;
 232             M1 = M1 + tmp;
 233             n += 1;
 234         }
 235
 236         return std::make_tuple(M1, M2/(n-1), M3/(n-1), M4/(n-1));
 237     }
 238     else
 239     {
 240         Real M1 = *first;
 241         Real M2 = 0;
 242         Real M3 = 0;
 243         Real M4 = 0;
 244         Real n = 2;
 245         for (auto it = std::next(first); it != last; ++it)
 246         {
 247             Real delta21 = *it - M1;
 248             Real tmp = delta21/n;
 249             M4 = M4 + tmp*(tmp*tmp*delta21*((n-1)*(n*n-3*n+3)) + 6*tmp*M2 - 4*M3);
 250             M3 = M3 + tmp*((n-1)*(n-2)*delta21*tmp - 3*M2);
 251             M2 = M2 + tmp*(n-1)*delta21;
 252             M1 = M1 + tmp;
 253             n += 1;
 254         }
 255
 256         return std::make_tuple(M1, M2/(n-1), M3/(n-1), M4/(n-1));
 257     }
 258 }
 259
 260 template<class Container>
 261 inline auto first_four_moments(Container const & v)
 262 {
 263     return first_four_moments(v.cbegin(), v.cend());
 264 }
 265
 266
 267 // Follows equation 1.6 of:
 268 // https://prod.sandia.gov/techlib-noauth/access-control.cgi/2008/086212.pdf
 269 template<class ForwardIterator>
 270 auto kurtosis(ForwardIterator first, ForwardIterator last)
 271 {
 272     auto [M1, M2, M3, M4] = first_four_moments(first, last);
 273     if (M2 == 0)
 274     {
 275         return M2;
 276     }
 277     return M4/(M2*M2);
 278 }
 279
 280 template<class Container>
 281 inline auto kurtosis(Container const & v)
 282 {
 283     return kurtosis(v.cbegin(), v.cend());
 284 }
 285
 286 template<class ForwardIterator>
 287 auto excess_kurtosis(ForwardIterator first, ForwardIterator last)
 288 {
 289     return kurtosis(first, last) - 3;
 290 }
 291
 292 template<class Container>
 293 inline auto excess_kurtosis(Container const & v)
 294 {
 295     return excess_kurtosis(v.cbegin(), v.cend());
 296 }
 297
 298
 299 template<class RandomAccessIterator>
 300 auto median(RandomAccessIterator first, RandomAccessIterator last)
 301 {
 302     size_t num_elems = std::distance(first, last);
 303     BOOST_ASSERT_MSG(num_elems > 0, "The median of a zero length vector is undefined.");
 304     if (num_elems & 1)
 305     {
 306         auto middle = first + (num_elems - 1)/2;
 307         std::nth_element(first, middle, last);
 308         return *middle;
 309     }
 310     else
 311     {
 312         auto middle = first + num_elems/2 - 1;
 313         std::nth_element(first, middle, last);
 314         std::nth_element(middle, middle+1, last);
 315         return (*middle + *(middle+1))/2;
 316     }
 317 }
 318
 319
 320 template<class RandomAccessContainer>
 321 inline auto median(RandomAccessContainer & v)
 322 {
 323     return median(v.begin(), v.end());
 324 }
 325
 326 template<class RandomAccessIterator>
 327 auto gini_coefficient(RandomAccessIterator first, RandomAccessIterator last)
 328 {
 329     using Real = typename std::iterator_traits<RandomAccessIterator>::value_type;
 330     BOOST_ASSERT_MSG(first != last && std::next(first) != last, "Computation of the Gini coefficient requires at least two samples.");
 331
 332     std::sort(first, last);
 333     if constexpr (std::is_integral<Real>::value)
 334     {
 335         double i = 1;
 336         double num = 0;
 337         double denom = 0;
 338         for (auto it = first; it != last; ++it)
 339         {
 340             num += *it*i;
 341             denom += *it;
 342             ++i;
 343         }
 344
 345         // If the l1 norm is zero, all elements are zero, so every element is the same.
 346         if (denom == 0)
 347         {
 348             return double(0);
 349         }
 350
 351         return ((2*num)/denom - i)/(i-1);
 352     }
 353     else
 354     {
 355         Real i = 1;
 356         Real num = 0;
 357         Real denom = 0;
 358         for (auto it = first; it != last; ++it)
 359         {
 360             num += *it*i;
 361             denom += *it;
 362             ++i;
 363         }
 364
 365         // If the l1 norm is zero, all elements are zero, so every element is the same.
 366         if (denom == 0)
 367         {
 368             return Real(0);
 369         }
 370
 371         return ((2*num)/denom - i)/(i-1);
 372     }
 373 }
 374
 375 template<class RandomAccessContainer>
 376 inline auto gini_coefficient(RandomAccessContainer & v)
 377 {
 378     return gini_coefficient(v.begin(), v.end());
 379 }
 380
 381 template<class RandomAccessIterator>
 382 inline auto sample_gini_coefficient(RandomAccessIterator first, RandomAccessIterator last)
 383 {
 384     size_t n = std::distance(first, last);
 385     return n*gini_coefficient(first, last)/(n-1);
 386 }
 387
 388 template<class RandomAccessContainer>
 389 inline auto sample_gini_coefficient(RandomAccessContainer & v)
 390 {
 391     return sample_gini_coefficient(v.begin(), v.end());
 392 }
 393
 394 template<class RandomAccessIterator>
 395 auto median_absolute_deviation(RandomAccessIterator first, RandomAccessIterator last, typename std::iterator_traits<RandomAccessIterator>::value_type center=std::numeric_limits<typename std::iterator_traits<RandomAccessIterator>::value_type>::quiet_NaN())
 396 {
 397     using std::abs;
 398     using Real = typename std::iterator_traits<RandomAccessIterator>::value_type;
 399     using std::isnan;
 400     if (isnan(center))
 401     {
 402         center = boost::math::tools::median(first, last);
 403     }
 404     size_t num_elems = std::distance(first, last);
 405     BOOST_ASSERT_MSG(num_elems > 0, "The median of a zero-length vector is undefined.");
 406     auto comparator = [&center](Real a, Real b) { return abs(a-center) < abs(b-center);};
 407     if (num_elems & 1)
 408     {
 409         auto middle = first + (num_elems - 1)/2;
 410         std::nth_element(first, middle, last, comparator);
 411         return abs(*middle);
 412     }
 413     else
 414     {
 415         auto middle = first + num_elems/2 - 1;
 416         std::nth_element(first, middle, last, comparator);
 417         std::nth_element(middle, middle+1, last, comparator);
 418         return (abs(*middle) + abs(*(middle+1)))/abs(static_cast<Real>(2));
 419     }
 420 }
 421
 422 template<class RandomAccessContainer>
 423 inline auto median_absolute_deviation(RandomAccessContainer & v, typename RandomAccessContainer::value_type center=std::numeric_limits<typename RandomAccessContainer::value_type>::quiet_NaN())
 424 {
 425     return median_absolute_deviation(v.begin(), v.end(), center);
 426 }
 427
 428 }
 429 #endif