1 /* statlib.c -- Statistical functions for testing the randomness of
5 Copyright 1999, 2000 Free Software Foundation, Inc.
7 This file is part of the GNU MP Library.
9 The GNU MP Library is free software; you can redistribute it and/or modify
10 it under the terms of the GNU Lesser General Public License as published by
11 the Free Software Foundation; either version 3 of the License, or (at your
12 option) any later version.
14 The GNU MP Library is distributed in the hope that it will be useful, but
15 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
17 License for more details.
19 You should have received a copy of the GNU Lesser General Public License
20 along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
22 /* The theories for these functions are taken from D. Knuth's "The Art
23 of Computer Programming: Volume 2, Seminumerical Algorithms", Third
24 Edition, Addison Wesley, 1998. */
26 /* Implementation notes.
28 The Kolmogorov-Smirnov test.
30 Eq. (13) in Knuth, p. 50, says that if X1, X2, ..., Xn are independent
31 observations arranged into ascending order
33 Kp = sqr(n) * max(j/n - F(Xj)) for all 1<=j<=n
34 Km = sqr(n) * max(F(Xj) - (j-1)/n)) for all 1<=j<=n
36 where F(x) = Pr(X <= x) = probability that (X <= x), which for a
37 uniformly distributed random real number between zero and one is
38 exactly the number itself (x).
41 The answer to exercise 23 gives the following implementation, which
42 doesn't need the observations to be sorted in ascending order:
44 for (k = 0; k < m; k++)
49 for (each observation Xj)
58 for (k = 0; k < m; k++)
60 rm = max (rm, a[k] - j/n)
62 rp = max (rp, j/n - b[k])
76 /* ks (Kp, Km, X, P, n) -- Perform a Kolmogorov-Smirnov test on the N
77 real numbers between zero and one in vector X. P is the
78 distribution function, called for each entry in X, which should
79 calculate the probability of X being greater than or equal to any
80 number in the sequence. (For a uniformly distributed sequence of
81 real numbers between zero and one, this is simply equal to X.) The
82 result is put in Kp and Km. */
88 void (P) (mpf_t, mpf_t),
94 mpf_t f_jnq; /* j/n or (j-1)/n */
97 /* Sort the vector in ascending order. */
98 qsort (X, n, sizeof (__mpf_struct), mpf_cmp);
101 /* Kp = sqr(n) * max(j/n - F(Xj)) for all 1<=j<=n
102 Km = sqr(n) * max(F(Xj) - (j-1)/n)) for all 1<=j<=n
105 mpf_init (Kt); mpf_init (f_x); mpf_init (f_j); mpf_init (f_jnq);
106 mpf_set_ui (Kp, 0); mpf_set_ui (Km, 0);
107 for (j = 1; j <= n; j++)
112 mpf_div_ui (f_jnq, f_j, n);
113 mpf_sub (Kt, f_jnq, f_x);
114 if (mpf_cmp (Kt, Kp) > 0)
116 if (g_debug > DEBUG_2)
118 printf ("j=%lu ", j);
119 printf ("P()="); mpf_out_str (stdout, 10, 2, f_x); printf ("\t");
121 printf ("jnq="); mpf_out_str (stdout, 10, 2, f_jnq); printf (" ");
122 printf ("diff="); mpf_out_str (stdout, 10, 2, Kt); printf (" ");
123 printf ("Kp="); mpf_out_str (stdout, 10, 2, Kp); printf ("\t");
125 mpf_sub_ui (f_j, f_j, 1);
126 mpf_div_ui (f_jnq, f_j, n);
127 mpf_sub (Kt, f_x, f_jnq);
128 if (mpf_cmp (Kt, Km) > 0)
131 if (g_debug > DEBUG_2)
133 printf ("jnq="); mpf_out_str (stdout, 10, 2, f_jnq); printf (" ");
134 printf ("diff="); mpf_out_str (stdout, 10, 2, Kt); printf (" ");
135 printf ("Km="); mpf_out_str (stdout, 10, 2, Km); printf (" ");
140 mpf_mul (Kp, Kp, Kt);
141 mpf_mul (Km, Km, Kt);
143 mpf_clear (Kt); mpf_clear (f_x); mpf_clear (f_j); mpf_clear (f_jnq);
146 /* ks_table(val, n) -- calculate probability for Kp/Km less than or
147 equal to VAL with N observations. See [Knuth section 3.3.1] */
150 ks_table (mpf_t p, mpf_t val, const unsigned int n)
152 /* We use Eq. (27), Knuth p.58, skipping O(1/n) for simplicity.
153 This shortcut will result in too high probabilities, especially
156 Pr(Kp(n) <= s) = 1 - pow(e, -2*s^2) * (1 - 2/3*s/sqrt(n) + O(1/n)) */
158 /* We have 's' in variable VAL and store the result in P. */
162 mpf_init (t1); mpf_init (t2);
164 /* t1 = 1 - 2/3 * s/sqrt(n) */
166 mpf_div (t1, val, t1);
167 mpf_mul_ui (t1, t1, 2);
168 mpf_div_ui (t1, t1, 3);
169 mpf_ui_sub (t1, 1, t1);
171 /* t2 = pow(e, -2*s^2) */
173 mpf_pow_ui (t2, val, 2); /* t2 = s^2 */
174 mpf_set_d (t2, exp (-(2.0 * mpf_get_d (t2))));
176 /* hmmm, gmp doesn't have pow() for floats. use doubles. */
177 mpf_set_d (t2, pow (M_E, -(2 * pow (mpf_get_d (val), 2))));
180 /* p = 1 - t1 * t2 */
181 mpf_mul (t1, t1, t2);
182 mpf_ui_sub (p, 1, t1);
184 mpf_clear (t1); mpf_clear (t2);
187 static double x2_table_X[][7] = {
188 { -2.33, -1.64, -.674, 0.0, 0.674, 1.64, 2.33 }, /* x */
189 { 5.4289, 2.6896, .454276, 0.0, .454276, 2.6896, 5.4289} /* x^2 */
192 #define _2D3 ((double) .6666666666)
194 /* x2_table (t, v, n) -- return chi-square table row for V in T[]. */
196 x2_table (double t[],
202 /* FIXME: Do a table lookup for v <= 30 since the following formula
203 [Knuth, vol 2, 3.3.1] is only good for v > 30. */
205 /* value = v + sqrt(2*v) * X[p] + (2/3) * X[p]^2 - 2/3 + O(1/sqrt(t) */
206 /* NOTE: The O() term is ignored for simplicity. */
208 for (f = 0; f < 7; f++)
211 sqrt (2 * v) * x2_table_X[0][f] +
212 _2D3 * x2_table_X[1][f] - _2D3;
216 /* P(p, x) -- Distribution function. Calculate the probability of X
217 being greater than or equal to any number in the sequence. For a
218 random real number between zero and one given by a uniformly
219 distributed random number generator, this is simply equal to X. */
227 /* mpf_freqt() -- Frequency test using KS on N real numbers between zero
228 and one. See [Knuth vol 2, p.61]. */
233 const unsigned long int n)
235 ks (Kp, Km, X, P, n);
239 /* The Chi-square test. Eq. (8) in Knuth vol. 2 says that if Y[]
240 holds the observations and p[] is the probability for.. (to be
243 V = 1/n * sum((s=1 to k) Y[s]^2 / p[s]) - n */
246 x2 (mpf_t V, /* result */
247 unsigned long int X[], /* data */
248 unsigned int k, /* #of categories */
249 void (P) (mpf_t, unsigned long int, void *), /* probability func */
250 void *x, /* extra user data passed to P() */
251 unsigned long int n) /* #of samples */
254 mpf_t f_t, f_t2; /* temp floats */
256 mpf_init (f_t); mpf_init (f_t2);
260 for (f = 0; f < k; f++)
262 if (g_debug > DEBUG_2)
263 fprintf (stderr, "%u: P()=", f);
264 mpf_set_ui (f_t, X[f]);
265 mpf_mul (f_t, f_t, f_t); /* f_t = X[f]^2 */
266 P (f_t2, f, x); /* f_t2 = Pr(f) */
267 if (g_debug > DEBUG_2)
268 mpf_out_str (stderr, 10, 2, f_t2);
269 mpf_div (f_t, f_t, f_t2);
271 if (g_debug > DEBUG_2)
273 fprintf (stderr, "\tV=");
274 mpf_out_str (stderr, 10, 2, V);
275 fprintf (stderr, "\t");
278 if (g_debug > DEBUG_2)
279 fprintf (stderr, "\n");
280 mpf_div_ui (V, V, n);
281 mpf_sub_ui (V, V, n);
283 mpf_clear (f_t); mpf_clear (f_t2);
286 /* Pzf(p, s, x) -- Probability for category S in mpz_freqt(). It's
287 1/d for all S. X is a pointer to an unsigned int holding 'd'. */
289 Pzf (mpf_t p, unsigned long int s, void *x)
292 mpf_div_ui (p, p, *((unsigned int *) x));
295 /* mpz_freqt(V, X, imax, n) -- Frequency test on integers. [Knuth,
296 vol 2, 3.3.2]. Keep IMAX low on this one, since we loop from 0 to
297 IMAX. 128 or 256 could be nice.
299 X[] must not contain numbers outside the range 0 <= X <= IMAX.
301 Return value is number of observations actually used, after
302 discarding entries out of range.
304 Since X[] contains integers between zero and IMAX, inclusive, we
305 have IMAX+1 categories.
307 Note that N should be at least 5*IMAX. Result is put in V and can
308 be compared to output from x2_table (v=IMAX). */
314 const unsigned long int n)
316 unsigned long int *v; /* result */
318 unsigned int d; /* number of categories = imax+1 */
320 unsigned long int usedn;
325 v = (unsigned long int *) calloc (imax + 1, sizeof (unsigned long int));
328 fprintf (stderr, "mpz_freqt(): out of memory\n");
333 usedn = n; /* actual number of observations */
334 for (f = 0; f < n; f++)
336 uitemp = mpz_get_ui(X[f]);
337 if (uitemp > imax) /* sanity check */
340 fprintf (stderr, "mpz_freqt(): warning: input insanity: %u, "\
341 "ignored.\n", uitemp);
348 if (g_debug > DEBUG_2)
350 fprintf (stderr, "counts:\n");
351 for (f = 0; f <= imax; f++)
352 fprintf (stderr, "%u:\t%lu\n", f, v[f]);
355 /* chi-square with k=imax+1 and P(x)=1/(imax+1) for all x.*/
356 x2 (V, v, d, Pzf, (void *) &d, usedn);
362 /* debug dummy to drag in dump funcs */
375 /* merit (rop, t, v, m) -- calculate merit for spectral test result in
376 dimension T, see Knuth p. 105. BUGS: Only valid for 2 <= T <=
379 merit (mpf_t rop, unsigned int t, mpf_t v, mpz_t m)
382 mpf_t f_m, f_const, f_pi;
386 mpf_init_set_d (f_const, M_PI);
387 mpf_init_set_d (f_pi, M_PI);
393 case 3: /* PI * 4/3 */
394 mpf_mul_ui (f_const, f_const, 4);
395 mpf_div_ui (f_const, f_const, 3);
397 case 4: /* PI^2 * 1/2 */
398 mpf_mul (f_const, f_const, f_pi);
399 mpf_div_ui (f_const, f_const, 2);
401 case 5: /* PI^2 * 8/15 */
402 mpf_mul (f_const, f_const, f_pi);
403 mpf_mul_ui (f_const, f_const, 8);
404 mpf_div_ui (f_const, f_const, 15);
406 case 6: /* PI^3 * 1/6 */
407 mpf_mul (f_const, f_const, f_pi);
408 mpf_mul (f_const, f_const, f_pi);
409 mpf_div_ui (f_const, f_const, 6);
413 "spect (merit): can't calculate merit for dimensions > 6\n");
414 mpf_set_ui (f_const, 0);
420 for (f = 1; f < t; f++)
421 mpf_mul (rop, rop, v);
422 mpf_mul (rop, rop, f_const);
423 mpf_div (rop, rop, f_m);
431 merit_u (unsigned int t, mpf_t v, mpz_t m)
437 merit (rop, t, v, m);
438 res = mpf_get_d (rop);
443 /* f_floor (rop, op) -- Set rop = floor (op). */
445 f_floor (mpf_t rop, mpf_t op)
451 /* No mpf_floor(). Convert to mpz and back. */
459 /* vz_dot (rop, v1, v2, nelem) -- compute dot product of z-vectors V1,
460 V2. N is number of elements in vectors V1 and V2. */
463 vz_dot (mpz_t rop, mpz_t V1[], mpz_t V2[], unsigned int n)
471 mpz_mul (t, V1[n], V2[n]);
472 mpz_add (rop, rop, t);
479 spectral_test (mpf_t rop[], unsigned int T, mpz_t a, mpz_t m)
481 /* Knuth "Seminumerical Algorithms, Third Edition", section 3.3.4
484 /* v[t] = min { sqrt (x[1]^2 + ... + x[t]^2) |
485 x[1] + a*x[2] + ... + pow (a, t-1) * x[t] is congruent to 0 (mod m) } */
490 unsigned int ui_i, ui_j, ui_k, ui_l;
491 mpf_t f_tmp1, f_tmp2;
492 mpz_t tmp1, tmp2, tmp3;
493 mpz_t U[GMP_SPECT_MAXT][GMP_SPECT_MAXT],
494 V[GMP_SPECT_MAXT][GMP_SPECT_MAXT],
498 mpz_t h, hp, r, s, p, pp, q, u, v;
503 for (ui_i = 0; ui_i < GMP_SPECT_MAXT; ui_i++)
505 for (ui_j = 0; ui_j < GMP_SPECT_MAXT; ui_j++)
507 mpz_init_set_ui (U[ui_i][ui_j], 0);
508 mpz_init_set_ui (V[ui_i][ui_j], 0);
510 mpz_init_set_ui (X[ui_i], 0);
511 mpz_init_set_ui (Y[ui_i], 0);
527 /* Implementation inits. */
528 if (T > GMP_SPECT_MAXT)
529 T = GMP_SPECT_MAXT; /* FIXME: Lazy. */
531 /* S1 [Initialize.] */
532 ui_t = 2 - 1; /* NOTE: `t' in description == ui_t + 1
539 mpz_pow_ui (s, a, 2);
540 mpz_add_ui (s, s, 1); /* s = 1 + a^2 */
542 /* S2 [Euclidean step.] */
545 if (g_debug > DEBUG_1)
547 mpz_mul (tmp1, h, pp);
548 mpz_mul (tmp2, hp, p);
549 mpz_sub (tmp1, tmp1, tmp2);
550 if (mpz_cmpabs (m, tmp1))
552 printf ("***BUG***: h*pp - hp*p = ");
553 mpz_out_str (stdout, 10, tmp1);
557 if (g_debug > DEBUG_2)
560 mpz_out_str (stdout, 10, hp);
562 mpz_out_str (stdout, 10, h);
568 mpz_tdiv_q (q, hp, h); /* q = floor(hp/h) */
572 if (g_debug > DEBUG_2)
575 mpz_out_str (stdout, 10, q);
580 mpz_mul (tmp1, q, h);
581 mpz_sub (u, hp, tmp1); /* u = hp - q*h */
583 mpz_mul (tmp1, q, p);
584 mpz_sub (v, pp, tmp1); /* v = pp - q*p */
586 mpz_pow_ui (tmp1, u, 2);
587 mpz_pow_ui (tmp2, v, 2);
588 mpz_add (tmp1, tmp1, tmp2);
589 if (mpz_cmp (tmp1, s) < 0)
591 mpz_set (s, tmp1); /* s = u^2 + v^2 */
592 mpz_set (hp, h); /* hp = h */
593 mpz_set (h, u); /* h = u */
594 mpz_set (pp, p); /* pp = p */
595 mpz_set (p, v); /* p = v */
601 /* S3 [Compute v2.] */
605 mpz_pow_ui (tmp1, u, 2);
606 mpz_pow_ui (tmp2, v, 2);
607 mpz_add (tmp1, tmp1, tmp2);
608 if (mpz_cmp (tmp1, s) < 0)
610 mpz_set (s, tmp1); /* s = u^2 + v^2 */
614 mpf_set_z (f_tmp1, s);
615 mpf_sqrt (rop[ui_t - 1], f_tmp1);
617 /* S4 [Advance t.] */
618 mpz_neg (U[0][0], h);
619 mpz_set (U[0][1], p);
620 mpz_neg (U[1][0], hp);
621 mpz_set (U[1][1], pp);
623 mpz_set (V[0][0], pp);
624 mpz_set (V[0][1], hp);
625 mpz_neg (V[1][0], p);
626 mpz_neg (V[1][1], h);
627 if (mpz_cmp_ui (pp, 0) > 0)
629 mpz_neg (V[0][0], V[0][0]);
630 mpz_neg (V[0][1], V[0][1]);
631 mpz_neg (V[1][0], V[1][0]);
632 mpz_neg (V[1][1], V[1][1]);
635 while (ui_t + 1 != T) /* S4 loop */
641 /* Add new row and column to U and V. They are initialized with
642 all elements set to zero, so clearing is not necessary. */
644 mpz_neg (U[ui_t][0], r); /* U: First col in new row. */
645 mpz_set_ui (U[ui_t][ui_t], 1); /* U: Last col in new row. */
647 mpz_set (V[ui_t][ui_t], m); /* V: Last col in new row. */
649 /* "Finally, for 1 <= i < t,
650 set q = round (vi1 * r / m),
654 for (ui_i = 0; ui_i < ui_t; ui_i++)
656 mpz_mul (tmp1, V[ui_i][0], r); /* tmp1=vi1*r */
657 zdiv_round (q, tmp1, m); /* q=round(vi1*r/m) */
658 mpz_mul (tmp2, q, m); /* tmp2=q*m */
659 mpz_sub (V[ui_i][ui_t], tmp1, tmp2);
661 for (ui_j = 0; ui_j <= ui_t; ui_j++) /* U[t] = U[t] + q*U[i] */
663 mpz_mul (tmp1, q, U[ui_i][ui_j]); /* tmp=q*uij */
664 mpz_add (U[ui_t][ui_j], U[ui_t][ui_j], tmp1); /* utj = utj + q*uij */
668 /* s = min (s, zdot (U[t], U[t]) */
669 vz_dot (tmp1, U[ui_t], U[ui_t], ui_t + 1);
670 if (mpz_cmp (tmp1, s) < 0)
674 ui_j = 0; /* WARNING: ui_j no longer a temp. */
676 /* S5 [Transform.] */
677 if (g_debug > DEBUG_2)
678 printf ("(t, k, j, q1, q2, ...)\n");
681 if (g_debug > DEBUG_2)
682 printf ("(%u, %u, %u", ui_t + 1, ui_k + 1, ui_j + 1);
684 for (ui_i = 0; ui_i <= ui_t; ui_i++)
688 vz_dot (tmp1, V[ui_i], V[ui_j], ui_t + 1); /* tmp1=dot(Vi,Vj). */
689 mpz_abs (tmp2, tmp1);
690 mpz_mul_ui (tmp2, tmp2, 2); /* tmp2 = 2*abs(dot(Vi,Vj) */
691 vz_dot (tmp3, V[ui_j], V[ui_j], ui_t + 1); /* tmp3=dot(Vj,Vj). */
693 if (mpz_cmp (tmp2, tmp3) > 0)
695 zdiv_round (q, tmp1, tmp3); /* q=round(Vi.Vj/Vj.Vj) */
696 if (g_debug > DEBUG_2)
699 mpz_out_str (stdout, 10, q);
702 for (ui_l = 0; ui_l <= ui_t; ui_l++)
704 mpz_mul (tmp1, q, V[ui_j][ui_l]);
705 mpz_sub (V[ui_i][ui_l], V[ui_i][ui_l], tmp1); /* Vi=Vi-q*Vj */
706 mpz_mul (tmp1, q, U[ui_i][ui_l]);
707 mpz_add (U[ui_j][ui_l], U[ui_j][ui_l], tmp1); /* Uj=Uj+q*Ui */
710 vz_dot (tmp1, U[ui_j], U[ui_j], ui_t + 1); /* tmp1=dot(Uj,Uj) */
711 if (mpz_cmp (tmp1, s) < 0) /* s = min(s,dot(Uj,Uj)) */
715 else if (g_debug > DEBUG_2)
716 printf (", #"); /* 2|Vi.Vj| <= Vj.Vj */
718 else if (g_debug > DEBUG_2)
719 printf (", *"); /* i == j */
722 if (g_debug > DEBUG_2)
725 /* S6 [Advance j.] */
731 while (ui_j != ui_k); /* S5 */
733 /* From Knuth p. 104: "The exhaustive search in steps S8-S10
734 reduces the value of s only rarely." */
736 /* S7 [Prepare for search.] */
737 /* Find minimum in (x[1], ..., x[t]) satisfying condition
738 x[k]^2 <= f(y[1], ...,y[t]) * dot(V[k],V[k]) */
741 if (g_debug > DEBUG_2)
743 printf ("searching...");
744 /*for (f = 0; f < ui_t*/
748 /* Z[i] = floor (sqrt (floor (dot(V[i],V[i]) * s / m^2))); */
749 mpz_pow_ui (tmp1, m, 2);
750 mpf_set_z (f_tmp1, tmp1);
751 mpf_set_z (f_tmp2, s);
752 mpf_div (f_tmp1, f_tmp2, f_tmp1); /* f_tmp1 = s/m^2 */
753 for (ui_i = 0; ui_i <= ui_t; ui_i++)
755 vz_dot (tmp1, V[ui_i], V[ui_i], ui_t + 1);
756 mpf_set_z (f_tmp2, tmp1);
757 mpf_mul (f_tmp2, f_tmp2, f_tmp1);
758 f_floor (f_tmp2, f_tmp2);
759 mpf_sqrt (f_tmp2, f_tmp2);
760 mpz_set_f (Z[ui_i], f_tmp2);
763 /* S8 [Advance X[k].] */
766 if (g_debug > DEBUG_2)
768 printf ("X[%u] = ", ui_k);
769 mpz_out_str (stdout, 10, X[ui_k]);
770 printf ("\tZ[%u] = ", ui_k);
771 mpz_out_str (stdout, 10, Z[ui_k]);
776 if (mpz_cmp (X[ui_k], Z[ui_k]))
778 mpz_add_ui (X[ui_k], X[ui_k], 1);
779 for (ui_i = 0; ui_i <= ui_t; ui_i++)
780 mpz_add (Y[ui_i], Y[ui_i], U[ui_k][ui_i]);
782 /* S9 [Advance k.] */
783 while (++ui_k <= ui_t)
785 mpz_neg (X[ui_k], Z[ui_k]);
786 mpz_mul_ui (tmp1, Z[ui_k], 2);
787 for (ui_i = 0; ui_i <= ui_t; ui_i++)
789 mpz_mul (tmp2, tmp1, U[ui_k][ui_i]);
790 mpz_sub (Y[ui_i], Y[ui_i], tmp2);
793 vz_dot (tmp1, Y, Y, ui_t + 1);
794 if (mpz_cmp (tmp1, s) < 0)
799 #endif /* DO_SEARCH */
800 mpf_set_z (f_tmp1, s);
801 mpf_sqrt (rop[ui_t - 1], f_tmp1);
803 if (g_debug > DEBUG_2)
805 #endif /* DO_SEARCH */
808 /* Clear GMP variables. */
812 for (ui_i = 0; ui_i < GMP_SPECT_MAXT; ui_i++)
814 for (ui_j = 0; ui_j < GMP_SPECT_MAXT; ui_j++)
816 mpz_clear (U[ui_i][ui_j]);
817 mpz_clear (V[ui_i][ui_j]);