Imported Upstream version 1.72.0
[platform/upstream/boost.git] / libs / math / doc / distributions / empirical_cdf.qbk
1 [/
2 Copyright (c) 2019 Nick Thompson
3 Use, modification and distribution are subject to the
4 Boost Software License, Version 1.0. (See accompanying file
5 LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
6 ]
7
8 [section:empirical_cdf Empirical Cumulative Distribution Function]
9
10 [heading Synopsis]
11
12 ```
13 #include <boost/math/distributions/empirical_cumulative_distribution_function.hpp>
14
15 namespace boost{ namespace math{
16
17 template <class RandomAccessContainer>
18 class empirical_cumulative_distribution_function
19 {
20 public:
21     using Real = typename RandomAccessContainer::value_type;
22     empirical_cumulative_distribution_function(RandomAccessContainer && v, bool sorted = false);
23
24     auto operator()(Real t) const;
25
26     RandomAccessContainer&& return_data();
27 };
28
29 }}
30 ```
31
32 [heading Empirical Cumulative Distribution Function]
33
34 The empirical cumulative distribution function is a step function constructed from observed data which converges to the true cumulative distribution function in the limit of infinite data.
35 This function is a basic building block of hypothesis testing workflows that attempt to answer the question "does my data come from a given distribution?"
36 These tests require computing quadratures over some function of the empirical CDF and the supposed CDF to create a distance measurement, and hence it is occasionally useful to construct a continuous callable from the data.
37
38 An example usage is demonstrated below:
39
40 ```
41 #include <vector>
42 #include <random>
43 #include <boost/math/distributions/empirical_cumulative_distribution_function.hpp>
44 using boost::math::empirical_cumulative_distribution_function;
45 std::random_device rd;
46 std::mt19937 gen{rd()};
47 std::normal_distribution<double> dis(0, 1);
48 size_t n = 128;
49 std::vector<double> v(n);
50 for (size_t i = 0; i < n; ++i) {
51   v[i] = dis(gen);
52 }
53
54 auto ecdf = empirical_cumulative_distribution_function(std::move(v));
55 std::cout << "ecdf(0.0) = " << ecdf(0.0) << "\n";
56 // should print approximately 0.5 . . .
57 ```
58
59 The empirical distribution function requires sorted data.
60 By default, the constructor sorts it for you at O(Nlog(N)) cost.
61
62 If your data is already sorted, you can specify this and the constructor simply moves your data into the class:
63
64 ```
65 std::sort(v.begin(), v.end());
66 auto ecdf = empirical_cumulative_distribution_function(std::move(v), /* already sorted = */ true);
67 ```
68
69 If you want your data back after being done with the object, use
70
71 ```
72 v = ecdf.return_data();
73 ```
74
75 This operation invalidates `ecdf`; it can no longer be used.
76
77 The call operator complexity is O(log(N)), as it requires a call to `std::upper_bound`.
78
79 Works with both integer and floating point types.
80 If the input data consists of integers, the output of the call operator is a double. Requires C++17.
81
82 [$../graphs/empiricial_cumulative_distribution_gauss.svg]
83
84 [$../graphs/empiricial_cumulative_distribution_uniform.svg]
85
86
87 [heading Performance]
88
89 ```
90 ------------------------------------------------------
91 Benchmark                                         Time
92 ------------------------------------------------------
93 ECDFConstructorSorted<double>/8                4.52 ns
94 ECDFConstructorSorted<double>/16               5.20 ns
95 ECDFConstructorSorted<double>/32               5.22 ns
96 ECDFConstructorSorted<double>/64               7.37 ns
97 ECDFConstructorSorted<double>/128              7.16 ns
98 ECDFConstructorSorted<double>/256              8.97 ns
99 ECDFConstructorSorted<double>/512              8.44 ns
100 ECDFConstructorSorted<double>/1024             9.07 ns
101 ECDFConstructorSorted<double>/2048             11.4 ns
102 ECDFConstructorSorted<double>/4096             12.6 ns
103 ECDFConstructorSorted<double>/8192             11.4 ns
104 ECDFConstructorSorted<double>/16384            16.0 ns
105 ECDFConstructorSorted<double>/32768            17.0 ns
106 ECDFConstructorSorted<double>/65536            19.5 ns
107 ECDFConstructorSorted<double>/131072           15.8 ns
108 ECDFConstructorSorted<double>/262144           17.9 ns
109 ECDFConstructorSorted<double>/524288           26.7 ns
110 ECDFConstructorSorted<double>/1048576          29.5 ns
111 ECDFConstructorSorted<double>/2097152          31.8 ns
112 ECDFConstructorSorted<double>/4194304          32.8 ns
113 ECDFConstructorSorted<double>/8388608          35.4 ns
114 ECDFConstructorSorted<double>/16777216         30.4 ns
115 ECDFConstructorSorted<double>_BigO             1.27 lgN
116 ECDFConstructorSorted<double>_RMS                20 %
117 ECDFConstructorUnsorted<double>/8               155 ns
118 ECDFConstructorUnsorted<double>/64             2095 ns
119 ECDFConstructorUnsorted<double>/512           22212 ns
120 ECDFConstructorUnsorted<double>/4096         220821 ns
121 ECDFConstructorUnsorted<double>/32768       1996380 ns
122 ECDFConstructorUnsorted<double>/262144     18916039 ns
123 ECDFConstructorUnsorted<double>/2097152   194250013 ns
124 ECDFConstructorUnsorted<double>/16777216 2281469424 ns
125 ECDFConstructorUnsorted<double>_BigO           5.65 NlgN
126 ECDFConstructorUnsorted<double>_RMS               6 %
127 Shuffle<double>/8                              82.4 ns
128 Shuffle<double>/64                              731 ns
129 Shuffle<double>/512                            5876 ns
130 Shuffle<double>/4096                          46864 ns
131 Shuffle<double>/32768                        385265 ns
132 Shuffle<double>/262144                      4663866 ns
133 Shuffle<double>/2097152                    54686332 ns
134 Shuffle<double>/16777216                  875309099 ns
135 Shuffle<double>_BigO                           2.16 NlgN
136 Shuffle<double>_RMS                              12 %
137 ECDFEvaluation<double>/8                       48.6 ns
138 ECDFEvaluation<double>/64                      61.3 ns
139 ECDFEvaluation<double>/512                     85.1 ns
140 ECDFEvaluation<double>/4096                     105 ns
141 ECDFEvaluation<double>/32768                    131 ns
142 ECDFEvaluation<double>/262144                   196 ns
143 ECDFEvaluation<double>/2097152                  391 ns
144 ECDFEvaluation<double>/16777216                 715 ns
145 ECDFEvaluation<double>_BigO                   18.19 lgN
146 ECDFEvaluation<double>_RMS                       60 %
147 ```
148
149 The call to the unsorted constructor is in fact a little faster than indicated, as the data must be shuffled after being sorted in the benchmark.
150 This is itself a fairly expensive operation.
151
152 [endsect]
153 [/section:empirical_cdf]