libs/math/doc/distributions/binomial.qbk

   1 [section:binomial_dist Binomial Distribution]
   2
   3 ``#include <boost/math/distributions/binomial.hpp>``
   4
   5    namespace boost{ namespace math{
   6
   7    template <class RealType = double,
   8              class ``__Policy``   = ``__policy_class`` >
   9    class binomial_distribution;
  10
  11    typedef binomial_distribution<> binomial;
  12
  13    template <class RealType, class ``__Policy``>
  14    class binomial_distribution
  15    {
  16    public:
  17       typedef RealType  value_type;
  18       typedef Policy    policy_type;
  19
  20       static const ``['unspecified-type]`` clopper_pearson_exact_interval;
  21       static const ``['unspecified-type]`` jeffreys_prior_interval;
  22
  23       // construct:
  24       binomial_distribution(RealType n, RealType p);
  25
  26       // parameter access::
  27       RealType success_fraction() const;
  28       RealType trials() const;
  29
  30       // Bounds on success fraction:
  31       static RealType find_lower_bound_on_p(
  32          RealType trials,
  33          RealType successes,
  34          RealType probability,
  35          ``['unspecified-type]`` method = clopper_pearson_exact_interval);
  36       static RealType find_upper_bound_on_p(
  37          RealType trials,
  38          RealType successes,
  39          RealType probability,
  40          ``['unspecified-type]`` method = clopper_pearson_exact_interval);
  41
  42       // estimate min/max number of trials:
  43       static RealType find_minimum_number_of_trials(
  44          RealType k,     // number of events
  45          RealType p,     // success fraction
  46          RealType alpha); // risk level
  47
  48       static RealType find_maximum_number_of_trials(
  49          RealType k,     // number of events
  50          RealType p,     // success fraction
  51          RealType alpha); // risk level
  52    };
  53
  54    }} // namespaces
  55
  56 The class type `binomial_distribution` represents a
  57 [@http://mathworld.wolfram.com/BinomialDistribution.html binomial distribution]:
  58 it is used when there are exactly two mutually
  59 exclusive outcomes of a trial. These outcomes are labelled
  60 "success" and "failure". The
  61 __binomial_distrib is used to obtain
  62 the probability of observing k successes in N trials, with the
  63 probability of success on a single trial denoted by p. The
  64 binomial distribution assumes that p is fixed for all trials.
  65
  66 [note The random variable for the binomial distribution is the number of successes,
  67 (the number of trials is a fixed property of the distribution)
  68 whereas for the negative binomial,
  69 the random variable is the number of trials, for a fixed number of successes.]
  70
  71 The PDF for the binomial distribution is given by:
  72
  73 [equation binomial_ref2]
  74
  75 The following two graphs illustrate how the PDF changes depending
  76 upon the distributions parameters, first we'll keep the success
  77 fraction /p/ fixed at 0.5, and vary the sample size:
  78
  79 [graph binomial_pdf_1]
  80
  81 Alternatively, we can keep the sample size fixed at N=20 and
  82 vary the success fraction /p/:
  83
  84 [graph binomial_pdf_2]
  85
  86 [discrete_quantile_warning Binomial]
  87
  88 [h4 Member Functions]
  89
  90 [h5 Construct]
  91
  92    binomial_distribution(RealType n, RealType p);
  93
  94 Constructor: /n/ is the total number of trials, /p/ is the
  95 probability of success of a single trial.
  96
  97 Requires `0 <= p <= 1`, and `n >= 0`, otherwise calls __domain_error.
  98
  99 [h5 Accessors]
 100
 101    RealType success_fraction() const;
 102
 103 Returns the parameter /p/ from which this distribution was constructed.
 104
 105    RealType trials() const;
 106
 107 Returns the parameter /n/ from which this distribution was constructed.
 108
 109 [h5 Lower Bound on the Success Fraction]
 110
 111    static RealType find_lower_bound_on_p(
 112       RealType trials,
 113       RealType successes,
 114       RealType alpha,
 115       ``['unspecified-type]`` method = clopper_pearson_exact_interval);
 116
 117 Returns a lower bound on the success fraction:
 118
 119 [variablelist
 120 [[trials][The total number of trials conducted.]]
 121 [[successes][The number of successes that occurred.]]
 122 [[alpha][The largest acceptable probability that the true value of
 123          the success fraction is [*less than] the value returned.]]
 124 [[method][An optional parameter that specifies the method to be used
 125          to compute the interval (See below).]]
 126 ]
 127
 128 For example, if you observe /k/ successes from /n/ trials the
 129 best estimate for the success fraction is simply ['k/n], but if you
 130 want to be 95% sure that the true value is [*greater than] some value,
 131 ['p[sub min]], then:
 132
 133    p``[sub min]`` = binomial_distribution<RealType>::find_lower_bound_on_p(n, k, 0.05);
 134
 135 [link math_toolkit.stat_tut.weg.binom_eg.binom_conf See worked example.]
 136
 137 There are currently two possible values available for the /method/
 138 optional parameter: /clopper_pearson_exact_interval/
 139 or /jeffreys_prior_interval/.  These constants are both members of
 140 class template `binomial_distribution`, so usage is for example:
 141
 142    p = binomial_distribution<RealType>::find_lower_bound_on_p(
 143        n, k, 0.05, binomial_distribution<RealType>::jeffreys_prior_interval);
 144
 145 The default method if this parameter is not specified is the Clopper Pearson
 146 "exact" interval.  This produces an interval that guarantees at least
 147 `100(1-alpha)%` coverage, but which is known to be overly conservative,
 148 sometimes producing intervals with much greater than the requested coverage.
 149
 150 The alternative calculation method produces a non-informative
 151 Jeffreys Prior interval.  It produces `100(1-alpha)%` coverage only
 152 ['in the average case], though is typically very close to the requested
 153 coverage level.  It is one of the main methods of calculation recommended
 154 in the review by Brown, Cai and DasGupta.
 155
 156 Please note that the "textbook" calculation method using
 157 a normal approximation (the Wald interval) is deliberately
 158 not provided: it is known to produce consistently poor results,
 159 even when the sample size is surprisingly large.
 160 Refer to Brown, Cai and DasGupta for a full explanation.  Many other methods
 161 of calculation are available, and may be more appropriate for specific
 162 situations.  Unfortunately there appears to be no consensus amongst
 163 statisticians as to which is "best": refer to the discussion at the end of
 164 Brown, Cai and DasGupta for examples.
 165
 166 The two methods provided here were chosen principally because they
 167 can be used for both one and two sided intervals.
 168 See also:
 169
 170 Lawrence D. Brown, T. Tony Cai and Anirban DasGupta (2001),
 171 Interval Estimation for a Binomial Proportion,
 172 Statistical Science, Vol. 16, No. 2, 101-133.
 173
 174 T. Tony Cai (2005),
 175 One-sided confidence intervals in discrete distributions,
 176 Journal of Statistical Planning and Inference 131, 63-88.
 177
 178 Agresti, A. and Coull, B. A. (1998). Approximate is better than
 179 "exact" for interval estimation of binomial proportions. Amer.
 180 Statist. 52 119-126.
 181
 182 Clopper, C. J. and Pearson, E. S. (1934). The use of confidence
 183 or fiducial limits illustrated in the case of the binomial.
 184 Biometrika 26 404-413.
 185
 186 [h5 Upper Bound on the Success Fraction]
 187
 188    static RealType find_upper_bound_on_p(
 189       RealType trials,
 190       RealType successes,
 191       RealType alpha,
 192       ``['unspecified-type]`` method = clopper_pearson_exact_interval);
 193
 194 Returns an upper bound on the success fraction:
 195
 196 [variablelist
 197 [[trials][The total number of trials conducted.]]
 198 [[successes][The number of successes that occurred.]]
 199 [[alpha][The largest acceptable probability that the true value of
 200          the success fraction is [*greater than] the value returned.]]
 201 [[method][An optional parameter that specifies the method to be used
 202          to compute the interval. Refer to the documentation for
 203          `find_upper_bound_on_p` above for the meaning of the
 204          method options.]]
 205 ]
 206
 207 For example, if you observe /k/ successes from /n/ trials the
 208 best estimate for the success fraction is simply ['k/n], but if you
 209 want to be 95% sure that the true value is [*less than] some value,
 210 ['p[sub max]], then:
 211
 212    p``[sub max]`` = binomial_distribution<RealType>::find_upper_bound_on_p(n, k, 0.05);
 213
 214 [link math_toolkit.stat_tut.weg.binom_eg.binom_conf See worked example.]
 215
 216 [note
 217 In order to obtain a two sided bound on the success fraction, you
 218 call both `find_lower_bound_on_p` *and* `find_upper_bound_on_p`
 219 each with the same arguments.
 220
 221 If the desired risk level
 222 that the true success fraction lies outside the bounds is [alpha],
 223 then you pass [alpha]/2 to these functions.
 224
 225 So for example a two sided 95% confidence interval would be obtained
 226 by passing [alpha] = 0.025 to each of the functions.
 227
 228 [link math_toolkit.stat_tut.weg.binom_eg.binom_conf See worked example.]
 229 ]
 230
 231
 232 [h5 Estimating the Number of Trials Required for a Certain Number of Successes]
 233
 234    static RealType find_minimum_number_of_trials(
 235       RealType k,     // number of events
 236       RealType p,     // success fraction
 237       RealType alpha); // probability threshold
 238
 239 This function estimates the minimum number of trials required to ensure that
 240 more than k events is observed with a level of risk /alpha/ that k or
 241 fewer events occur.
 242
 243 [variablelist
 244 [[k][The number of success observed.]]
 245 [[p][The probability of success for each trial.]]
 246 [[alpha][The maximum acceptable probability that k events or fewer will be observed.]]
 247 ]
 248
 249 For example:
 250
 251    binomial_distribution<RealType>::find_number_of_trials(10, 0.5, 0.05);
 252
 253 Returns the smallest number of trials we must conduct to be 95% sure
 254 of seeing 10 events that occur with frequency one half.
 255
 256 [h5 Estimating the Maximum Number of Trials to Ensure no more than a Certain Number of Successes]
 257
 258    static RealType find_maximum_number_of_trials(
 259       RealType k,     // number of events
 260       RealType p,     // success fraction
 261       RealType alpha); // probability threshold
 262
 263 This function estimates the maximum number of trials we can conduct
 264 to ensure that k successes or fewer are observed, with a risk /alpha/
 265 that more than k occur.
 266
 267 [variablelist
 268 [[k][The number of success observed.]]
 269 [[p][The probability of success for each trial.]]
 270 [[alpha][The maximum acceptable probability that more than k events will be observed.]]
 271 ]
 272
 273 For example:
 274
 275    binomial_distribution<RealType>::find_maximum_number_of_trials(0, 1e-6, 0.05);
 276
 277 Returns the largest number of trials we can conduct and still be 95% certain
 278 of not observing any events that occur with one in a million frequency.
 279 This is typically used in failure analysis.
 280
 281 [link math_toolkit.stat_tut.weg.binom_eg.binom_size_eg See Worked Example.]
 282
 283 [h4 Non-member Accessors]
 284
 285 All the [link math_toolkit.dist_ref.nmp usual non-member accessor functions]
 286 that are generic to all distributions are supported: __usual_accessors.
 287
 288 The domain for the random variable /k/ is `0 <= k <= N`, otherwise a
 289 __domain_error is returned.
 290
 291 It's worth taking a moment to define what these accessors actually mean in
 292 the context of this distribution:
 293
 294 [table Meaning of the non-member accessors
 295 [[Function][Meaning]]
 296 [[__pdf]
 297    [The probability of obtaining [*exactly k successes] from n trials
 298    with success fraction p.  For example:
 299
 300 `pdf(binomial(n, p), k)`]]
 301 [[__cdf]
 302    [The probability of obtaining [*k successes or fewer] from n trials
 303    with success fraction p.  For example:
 304
 305 `cdf(binomial(n, p), k)`]]
 306 [[__ccdf]
 307    [The probability of obtaining [*more than k successes] from n trials
 308    with success fraction p.  For example:
 309
 310 `cdf(complement(binomial(n, p), k))`]]
 311 [[__quantile]
 312    [Given a binomial distribution with ['n] trials, success fraction ['p] and probability ['P],
 313    finds the largest number of successes ['k] whose CDF is less than ['P].
 314    It is strongly recommended that you read the tutorial
 315    [link math_toolkit.pol_tutorial.understand_dis_quant Understanding Quantiles of Discrete Distributions] before
 316    using the quantile function.]]
 317 [[__quantile_c]
 318    [Given a binomial distribution with ['n] trials, success fraction ['p] and probability ['Q],
 319    finds the smallest number of successes ['k] whose CDF is greater than ['1-Q].
 320    It is strongly recommended that you read the tutorial
 321    [link math_toolkit.pol_tutorial.understand_dis_quant Understanding Quantiles of Discrete Distributions] before
 322    using the quantile function.]]
 323 ]
 324
 325 [h4 Examples]
 326
 327 Various [link math_toolkit.stat_tut.weg.binom_eg worked examples]
 328 are available illustrating the use of the binomial distribution.
 329
 330 [h4 Accuracy]
 331
 332 This distribution is implemented using the
 333 incomplete beta functions __ibeta and __ibetac,
 334 please refer to these functions for information on accuracy.
 335
 336 [h4 Implementation]
 337
 338 In the following table /p/ is the probability that one trial will
 339 be successful (the success fraction), /n/ is the number of trials,
 340 /k/ is the number of successes, /p/ is the probability and /q = 1-p/.
 341
 342 [table
 343 [[Function][Implementation Notes]]
 344 [[pdf][Implementation is in terms of __ibeta_derivative: if [sub n]C[sub k ] is the binomial
 345        coefficient of a and b, then we have:
 346
 347 [equation binomial_ref1]
 348
 349 Which can be evaluated as `ibeta_derivative(k+1, n-k+1, p) / (n+1)`
 350
 351 The function __ibeta_derivative is used here, since it has already
 352        been optimised for the lowest possible error - indeed this is really
 353        just a thin wrapper around part of the internals of the incomplete
 354        beta function.
 355
 356 There are also various special cases: refer to the code for details.
 357        ]]
 358 [[cdf][Using the relation:
 359
 360 ``
 361 p = I[sub 1-p](n - k, k + 1)
 362   = 1 - I[sub p](k + 1, n - k)
 363   = __ibetac(k + 1, n - k, p)``
 364
 365 There are also various special cases: refer to the code for details.
 366 ]]
 367 [[cdf complement][Using the relation: q = __ibeta(k + 1, n - k, p)
 368
 369 There are also various special cases: refer to the code for details. ]]
 370 [[quantile][Since the cdf is non-linear in variate /k/ none of the inverse
 371             incomplete beta functions can be used here.  Instead the quantile
 372             is found numerically using a derivative free method
 373             (__root_finding_TOMS748).]]
 374 [[quantile from the complement][Found numerically as above.]]
 375 [[mean][ `p * n` ]]
 376 [[variance][ `p * n * (1-p)` ]]
 377 [[mode][`floor(p * (n + 1))`]]
 378 [[skewness][`(1 - 2 * p) / sqrt(n * p * (1 - p))`]]
 379 [[kurtosis][`3 - (6 / n) + (1 / (n * p * (1 - p)))`]]
 380 [[kurtosis excess][`(1 - 6 * p * q) / (n * p * q)`]]
 381 [[parameter estimation][The member functions `find_upper_bound_on_p`
 382        `find_lower_bound_on_p` and `find_number_of_trials` are
 383        implemented in terms of the inverse incomplete beta functions
 384        __ibetac_inv, __ibeta_inv, and __ibetac_invb respectively]]
 385 ]
 386
 387 [h4 References]
 388
 389 * [@http://mathworld.wolfram.com/BinomialDistribution.html Weisstein, Eric W. "Binomial Distribution." From MathWorld--A Wolfram Web Resource].
 390 * [@http://en.wikipedia.org/wiki/Beta_distribution Wikipedia binomial distribution].
 391 * [@http://www.itl.nist.gov/div898/handbook/eda/section3/eda366i.htm  NIST Explorary Data Analysis].
 392
 393 [endsect] [/section:binomial_dist Binomial]
 394
 395 [/ binomial.qbk
 396   Copyright 2006 John Maddock and Paul A. Bristow.
 397   Distributed under the Boost Software License, Version 1.0.
 398   (See accompanying file LICENSE_1_0.txt or copy at
 399   http://www.boost.org/LICENSE_1_0.txt).
 400 ]