| /* statistic_tests.hpp header file |
| * |
| * Copyright Jens Maurer 2000 |
| * Distributed under the Boost Software License, Version 1.0. (See |
| * accompanying file LICENSE_1_0.txt or copy at |
| * http://www.boost.org/LICENSE_1_0.txt) |
| * |
| * $Id: statistic_tests.hpp 60755 2010-03-22 00:45:06Z steven_watanabe $ |
| * |
| */ |
| |
| #ifndef STATISTIC_TESTS_HPP |
| #define STATISTIC_TESTS_HPP |
| |
| #include <stdexcept> |
| #include <iterator> |
| #include <vector> |
| #include <boost/limits.hpp> |
| #include <algorithm> |
| #include <cmath> |
| |
| #include <boost/random.hpp> |
| #include <boost/config.hpp> |
| #include <boost/bind.hpp> |
| |
| #include "integrate.hpp" |
| |
| #if defined(BOOST_MSVC) && BOOST_MSVC <= 1300 |
| namespace std |
| { |
| inline double pow(double a, double b) { return ::pow(a,b); } |
| inline double ceil(double x) { return ::ceil(x); } |
| } // namespace std |
| #endif |
| |
| |
| template<class T> |
| inline T fac(int k) |
| { |
| T result = 1; |
| for(T i = 2; i <= k; ++i) |
| result *= i; |
| return result; |
| } |
| |
| template<class T> |
| T binomial(int n, int k) |
| { |
| if(k < n/2) |
| k = n-k; |
| T result = 1; |
| for(int i = k+1; i<= n; ++i) |
| result *= i; |
| return result / fac<T>(n-k); |
| } |
| |
| template<class T> |
| T stirling2(int n, int m) |
| { |
| T sum = 0; |
| for(int k = 0; k <= m; ++k) |
| sum += binomial<T>(m, k) * std::pow(double(k), n) * |
| ( (m-k)%2 == 0 ? 1 : -1); |
| return sum / fac<T>(m); |
| } |
| |
| /* |
| * Experiments which create an empirical distribution in classes, |
| * suitable for the chi-square test. |
| */ |
| // std::floor(gen() * classes) |
| |
| class experiment_base |
| { |
| public: |
| experiment_base(int cls) : _classes(cls) { } |
| unsigned int classes() const { return _classes; } |
| protected: |
| unsigned int _classes; |
| }; |
| |
| class equidistribution_experiment : public experiment_base |
| { |
| public: |
| explicit equidistribution_experiment(unsigned int classes) |
| : experiment_base(classes) { } |
| |
| template<class NumberGenerator, class Counter> |
| void run(NumberGenerator & f, Counter & count, int n) const |
| { |
| assert((f.min)() == 0 && |
| static_cast<unsigned int>((f.max)()) == classes()-1); |
| for(int i = 0; i < n; ++i) |
| count(f()); |
| } |
| double probability(int i) const { return 1.0/classes(); } |
| }; |
| |
| // two-dimensional equidistribution experiment |
| class equidistribution_2d_experiment : public equidistribution_experiment |
| { |
| public: |
| explicit equidistribution_2d_experiment(unsigned int classes) |
| : equidistribution_experiment(classes) { } |
| |
| template<class NumberGenerator, class Counter> |
| void run(NumberGenerator & f, Counter & count, int n) const |
| { |
| unsigned int range = (f.max)()+1; |
| assert((f.min)() == 0 && range*range == classes()); |
| for(int i = 0; i < n; ++i) { |
| int y1 = f(); |
| int y2 = f(); |
| count(y1 + range * y2); |
| } |
| } |
| }; |
| |
| // distribution experiment: assume a probability density and |
| // count events so that an equidistribution results. |
| class distribution_experiment : public equidistribution_experiment |
| { |
| public: |
| template<class Distribution> |
| distribution_experiment(Distribution dist , unsigned int classes) |
| : equidistribution_experiment(classes), limit(classes) |
| { |
| for(unsigned int i = 0; i < classes-1; ++i) |
| limit[i] = quantile(dist, (i+1)*0.05); |
| limit[classes-1] = std::numeric_limits<double>::infinity(); |
| if(limit[classes-1] < (std::numeric_limits<double>::max)()) |
| limit[classes-1] = (std::numeric_limits<double>::max)(); |
| #if 0 |
| std::cout << __PRETTY_FUNCTION__ << ": "; |
| for(unsigned int i = 0; i < classes; ++i) |
| std::cout << limit[i] << " "; |
| std::cout << std::endl; |
| #endif |
| } |
| |
| template<class NumberGenerator, class Counter> |
| void run(NumberGenerator & f, Counter & count, int n) const |
| { |
| for(int i = 0; i < n; ++i) { |
| limits_type::const_iterator it = |
| std::lower_bound(limit.begin(), limit.end(), f()); |
| count(it-limit.begin()); |
| } |
| } |
| private: |
| typedef std::vector<double> limits_type; |
| limits_type limit; |
| }; |
| |
| template<bool up, bool is_float> |
| struct runs_direction_helper |
| { |
| template<class T> |
| static T init(T) |
| { |
| return (std::numeric_limits<T>::max)(); |
| } |
| }; |
| |
| template<> |
| struct runs_direction_helper<true, true> |
| { |
| template<class T> |
| static T init(T) |
| { |
| return -(std::numeric_limits<T>::max)(); |
| } |
| }; |
| |
| template<> |
| struct runs_direction_helper<true, false> |
| { |
| template<class T> |
| static T init(T) |
| { |
| return (std::numeric_limits<T>::min)(); |
| } |
| }; |
| |
| // runs-up/runs-down experiment |
| template<bool up> |
| class runs_experiment : public experiment_base |
| { |
| public: |
| explicit runs_experiment(unsigned int classes) : experiment_base(classes) { } |
| |
| template<class NumberGenerator, class Counter> |
| void run(NumberGenerator & f, Counter & count, int n) const |
| { |
| typedef typename NumberGenerator::result_type result_type; |
| result_type init = |
| runs_direction_helper< |
| up, |
| !std::numeric_limits<result_type>::is_integer |
| >::init(result_type()); |
| result_type previous = init; |
| unsigned int length = 0; |
| for(int i = 0; i < n; ++i) { |
| result_type val = f(); |
| if(up ? previous <= val : previous >= val) { |
| previous = val; |
| ++length; |
| } else { |
| count((std::min)(length, classes())-1); |
| length = 0; |
| previous = init; |
| // don't use this value, so that runs are independent |
| } |
| } |
| } |
| double probability(unsigned int r) const |
| { |
| if(r == classes()-1) |
| return 1.0/fac<double>(classes()); |
| else |
| return static_cast<double>(r+1)/fac<double>(r+2); |
| } |
| }; |
| |
| // gap length experiment |
| class gap_experiment : public experiment_base |
| { |
| public: |
| template<class Dist> |
| gap_experiment(unsigned int classes, const Dist & dist, double alpha, double beta) |
| : experiment_base(classes), alpha(alpha), beta(beta), low(quantile(dist, alpha)), high(quantile(dist, beta)) {} |
| |
| template<class NumberGenerator, class Counter> |
| void run(NumberGenerator & f, Counter & count, int n) const |
| { |
| typedef typename NumberGenerator::result_type result_type; |
| unsigned int length = 0; |
| for(int i = 0; i < n; ) { |
| result_type value = f(); |
| if(value < low || value > high) |
| ++length; |
| else { |
| count((std::min)(length, classes()-1)); |
| length = 0; |
| ++i; |
| } |
| } |
| } |
| double probability(unsigned int r) const |
| { |
| double p = beta-alpha; |
| if(r == classes()-1) |
| return std::pow(1-p, static_cast<double>(r)); |
| else |
| return p * std::pow(1-p, static_cast<double>(r)); |
| } |
| private: |
| double alpha, beta; |
| double low, high; |
| }; |
| |
| // poker experiment |
| class poker_experiment : public experiment_base |
| { |
| public: |
| poker_experiment(unsigned int d, unsigned int k) |
| : experiment_base(k), range(d) |
| { |
| assert(range > 1); |
| } |
| |
| template<class UniformRandomNumberGenerator, class Counter> |
| void run(UniformRandomNumberGenerator & f, Counter & count, int n) const |
| { |
| typedef typename UniformRandomNumberGenerator::result_type result_type; |
| assert(std::numeric_limits<result_type>::is_integer); |
| assert((f.min)() == 0); |
| assert((f.max)() == static_cast<result_type>(range-1)); |
| std::vector<result_type> v(classes()); |
| for(int i = 0; i < n; ++i) { |
| for(unsigned int j = 0; j < classes(); ++j) |
| v[j] = f(); |
| std::sort(v.begin(), v.end()); |
| result_type prev = v[0]; |
| int r = 1; // count different values in v |
| for(unsigned int i = 1; i < classes(); ++i) { |
| if(prev != v[i]) { |
| prev = v[i]; |
| ++r; |
| } |
| } |
| count(r-1); |
| } |
| } |
| |
| double probability(unsigned int r) const |
| { |
| ++r; // transform to 1 <= r <= 5 |
| double result = range; |
| for(unsigned int i = 1; i < r; ++i) |
| result *= range-i; |
| return result / std::pow(range, static_cast<double>(classes())) * |
| stirling2<double>(classes(), r); |
| } |
| private: |
| unsigned int range; |
| }; |
| |
| // coupon collector experiment |
| class coupon_collector_experiment : public experiment_base |
| { |
| public: |
| coupon_collector_experiment(unsigned int d, unsigned int cls) |
| : experiment_base(cls), d(d) |
| { |
| assert(d > 1); |
| } |
| |
| template<class UniformRandomNumberGenerator, class Counter> |
| void run(UniformRandomNumberGenerator & f, Counter & count, int n) const |
| { |
| typedef typename UniformRandomNumberGenerator::result_type result_type; |
| assert(std::numeric_limits<result_type>::is_integer); |
| assert((f.min)() == 0); |
| assert((f.max)() == static_cast<result_type>(d-1)); |
| std::vector<bool> occurs(d); |
| for(int i = 0; i < n; ++i) { |
| occurs.assign(d, false); |
| unsigned int r = 0; // length of current sequence |
| int q = 0; // number of non-duplicates in current set |
| for(;;) { |
| result_type val = f(); |
| ++r; |
| if(!occurs[val]) { // new set element |
| occurs[val] = true; |
| ++q; |
| if(q == d) |
| break; // one complete set |
| } |
| } |
| count((std::min)(r-d, classes()-1)); |
| } |
| } |
| double probability(unsigned int r) const |
| { |
| if(r == classes()-1) |
| return 1-fac<double>(d)/std::pow(d, static_cast<double>(d+classes()-2))* |
| stirling2<double>(d+classes()-2, d); |
| else |
| return fac<double>(d)/std::pow(d, static_cast<double>(d+r)) * |
| stirling2<double>(d+r-1, d-1); |
| } |
| private: |
| int d; |
| }; |
| |
| // permutation test |
| class permutation_experiment : public equidistribution_experiment |
| { |
| public: |
| permutation_experiment(unsigned int t) |
| : equidistribution_experiment(fac<int>(t)), t(t) |
| { |
| assert(t > 1); |
| } |
| |
| template<class UniformRandomNumberGenerator, class Counter> |
| void run(UniformRandomNumberGenerator & f, Counter & count, int n) const |
| { |
| typedef typename UniformRandomNumberGenerator::result_type result_type; |
| std::vector<result_type> v(t); |
| for(int i = 0; i < n; ++i) { |
| for(int j = 0; j < t; ++j) { |
| v[j] = f(); |
| } |
| int x = 0; |
| for(int r = t-1; r > 0; r--) { |
| typename std::vector<result_type>::iterator it = |
| std::max_element(v.begin(), v.begin()+r+1); |
| x = (r+1)*x + (it-v.begin()); |
| std::iter_swap(it, v.begin()+r); |
| } |
| count(x); |
| } |
| } |
| private: |
| int t; |
| }; |
| |
| // birthday spacing experiment test |
| class birthday_spacing_experiment : public experiment_base |
| { |
| public: |
| birthday_spacing_experiment(unsigned int d, int n, int m) |
| : experiment_base(d), n(n), m(m) |
| { |
| } |
| |
| template<class UniformRandomNumberGenerator, class Counter> |
| void run(UniformRandomNumberGenerator & f, Counter & count, int n_total) const |
| { |
| typedef typename UniformRandomNumberGenerator::result_type result_type; |
| assert(std::numeric_limits<result_type>::is_integer); |
| assert((f.min)() == 0); |
| assert((f.max)() == static_cast<result_type>(m-1)); |
| |
| for(int j = 0; j < n_total; j++) { |
| std::vector<result_type> v(n); |
| std::generate_n(v.begin(), n, f); |
| std::sort(v.begin(), v.end()); |
| std::vector<result_type> spacing(n); |
| for(int i = 0; i < n-1; i++) |
| spacing[i] = v[i+1]-v[i]; |
| spacing[n-1] = v[0] + m - v[n-1]; |
| std::sort(spacing.begin(), spacing.end()); |
| unsigned int k = 0; |
| for(int i = 0; i < n-1; ++i) { |
| if(spacing[i] == spacing[i+1]) |
| ++k; |
| } |
| count((std::min)(k, classes()-1)); |
| } |
| } |
| |
| double probability(unsigned int r) const |
| { |
| assert(classes() == 4); |
| assert(m == (1<<25)); |
| assert(n == 512); |
| static const double prob[] = { 0.368801577, 0.369035243, 0.183471182, |
| 0.078691997 }; |
| return prob[r]; |
| } |
| private: |
| int n, m; |
| }; |
| /* |
| * Misc. helper functions. |
| */ |
| |
| template<class Float> |
| struct distribution_function |
| { |
| typedef Float result_type; |
| typedef Float argument_type; |
| typedef Float first_argument_type; |
| typedef Float second_argument_type; |
| }; |
| |
| // computes P(K_n <= t) or P(t1 <= K_n <= t2). See Knuth, 3.3.1 |
| class kolmogorov_smirnov_probability : public distribution_function<double> |
| { |
| public: |
| kolmogorov_smirnov_probability(int n) |
| : approx(n > 50), n(n), sqrt_n(std::sqrt(double(n))) |
| { |
| if(!approx) |
| n_n = std::pow(static_cast<double>(n), n); |
| } |
| |
| double cdf(double t) const |
| { |
| if(approx) { |
| return 1-std::exp(-2*t*t)*(1-2.0/3.0*t/sqrt_n); |
| } else { |
| t *= sqrt_n; |
| double sum = 0; |
| for(int k = static_cast<int>(std::ceil(t)); k <= n; k++) |
| sum += binomial<double>(n, k) * std::pow(k-t, k) * |
| std::pow(t+n-k, n-k-1); |
| return 1 - t/n_n * sum; |
| } |
| } |
| //double operator()(double t1, double t2) const |
| //{ return operator()(t2) - operator()(t1); } |
| |
| private: |
| bool approx; |
| int n; |
| double sqrt_n; |
| double n_n; |
| }; |
| |
| inline double cdf(const kolmogorov_smirnov_probability& dist, double val) |
| { |
| return dist.cdf(val); |
| } |
| |
| inline double quantile(const kolmogorov_smirnov_probability& dist, double val) |
| { |
| return invert_monotone_inc(boost::bind(&cdf, dist, _1), val, 0.0, 1000.0); |
| } |
| |
| /* |
| * Experiments for generators with continuous distribution functions |
| */ |
| class kolmogorov_experiment |
| { |
| public: |
| kolmogorov_experiment(int n) : n(n), ksp(n) { } |
| template<class NumberGenerator, class Distribution> |
| double run(NumberGenerator & gen, Distribution distrib) const |
| { |
| const int m = n; |
| typedef std::vector<double> saved_temp; |
| saved_temp a(m,1.0), b(m,0); |
| std::vector<int> c(m,0); |
| for(int i = 0; i < n; ++i) { |
| double val = static_cast<double>(gen()); |
| double y = cdf(distrib, val); |
| int k = static_cast<int>(std::floor(m*y)); |
| if(k >= m) |
| --k; // should not happen |
| a[k] = (std::min)(a[k], y); |
| b[k] = (std::max)(b[k], y); |
| ++c[k]; |
| } |
| double kplus = 0, kminus = 0; |
| int j = 0; |
| for(int k = 0; k < m; ++k) { |
| if(c[k] > 0) { |
| kminus = (std::max)(kminus, a[k]-j/static_cast<double>(n)); |
| j += c[k]; |
| kplus = (std::max)(kplus, j/static_cast<double>(n) - b[k]); |
| } |
| } |
| kplus *= std::sqrt(double(n)); |
| kminus *= std::sqrt(double(n)); |
| // std::cout << "k+ " << kplus << " k- " << kminus << std::endl; |
| return kplus; |
| } |
| double probability(double x) const |
| { |
| return cdf(ksp, x); |
| } |
| private: |
| int n; |
| kolmogorov_smirnov_probability ksp; |
| }; |
| |
| struct power_distribution |
| { |
| power_distribution(double t) : t(t) {} |
| double t; |
| }; |
| |
| double cdf(const power_distribution& dist, double val) |
| { |
| return std::pow(val, dist.t); |
| } |
| |
| // maximum-of-t test (KS-based) |
| template<class UniformRandomNumberGenerator> |
| class maximum_experiment |
| { |
| public: |
| typedef UniformRandomNumberGenerator base_type; |
| maximum_experiment(base_type & f, int n, int t) : f(f), ke(n), t(t) |
| { } |
| |
| double operator()() const |
| { |
| generator gen(f, t); |
| return ke.run(gen, power_distribution(t)); |
| } |
| |
| private: |
| struct generator { |
| generator(base_type & f, int t) : f(f, boost::uniform_01<>()), t(t) { } |
| double operator()() |
| { |
| double mx = f(); |
| for(int i = 1; i < t; ++i) |
| mx = (std::max)(mx, f()); |
| return mx; |
| } |
| private: |
| boost::variate_generator<base_type&, boost::uniform_01<> > f; |
| int t; |
| }; |
| base_type & f; |
| kolmogorov_experiment ke; |
| int t; |
| }; |
| |
| // compute a chi-square value for the distribution approximation error |
| template<class ForwardIterator, class UnaryFunction> |
| typename UnaryFunction::result_type |
| chi_square_value(ForwardIterator first, ForwardIterator last, |
| UnaryFunction probability) |
| { |
| typedef std::iterator_traits<ForwardIterator> iter_traits; |
| typedef typename iter_traits::value_type counter_type; |
| typedef typename UnaryFunction::result_type result_type; |
| unsigned int classes = std::distance(first, last); |
| result_type sum = 0; |
| counter_type n = 0; |
| for(unsigned int i = 0; i < classes; ++first, ++i) { |
| counter_type count = *first; |
| n += count; |
| sum += (count/probability(i)) * count; // avoid overflow |
| } |
| #if 0 |
| for(unsigned int i = 0; i < classes; ++i) { |
| // std::cout << (n*probability(i)) << " "; |
| if(n * probability(i) < 5) |
| std::cerr << "Not enough test runs for slot " << i |
| << " p=" << probability(i) << ", n=" << n |
| << std::endl; |
| } |
| #endif |
| // std::cout << std::endl; |
| // throw std::invalid_argument("not enough test runs"); |
| |
| return sum/n - n; |
| } |
| template<class RandomAccessContainer> |
| class generic_counter |
| { |
| public: |
| explicit generic_counter(unsigned int classes) : container(classes, 0) { } |
| void operator()(int i) |
| { |
| assert(i >= 0); |
| assert(static_cast<unsigned int>(i) < container.size()); |
| ++container[i]; |
| } |
| typename RandomAccessContainer::const_iterator begin() const |
| { return container.begin(); } |
| typename RandomAccessContainer::const_iterator end() const |
| { return container.end(); } |
| |
| private: |
| RandomAccessContainer container; |
| }; |
| |
| // chi_square test |
| template<class Experiment, class Generator> |
| double run_experiment(const Experiment & experiment, Generator & gen, int n) |
| { |
| generic_counter<std::vector<int> > v(experiment.classes()); |
| experiment.run(gen, v, n); |
| return chi_square_value(v.begin(), v.end(), |
| std::bind1st(std::mem_fun_ref(&Experiment::probability), |
| experiment)); |
| } |
| |
| // chi_square test |
| template<class Experiment, class Generator> |
| double run_experiment(const Experiment & experiment, const Generator & gen, int n) |
| { |
| generic_counter<std::vector<int> > v(experiment.classes()); |
| experiment.run(gen, v, n); |
| return chi_square_value(v.begin(), v.end(), |
| std::bind1st(std::mem_fun_ref(&Experiment::probability), |
| experiment)); |
| } |
| |
| // number generator with experiment results (for nesting) |
| template<class Experiment, class Generator> |
| class experiment_generator_t |
| { |
| public: |
| experiment_generator_t(const Experiment & exper, Generator & gen, int n) |
| : experiment(exper), generator(gen), n(n) { } |
| double operator()() const { return run_experiment(experiment, generator, n); } |
| private: |
| const Experiment & experiment; |
| Generator & generator; |
| int n; |
| }; |
| |
| template<class Experiment, class Generator> |
| experiment_generator_t<Experiment, Generator> |
| experiment_generator(const Experiment & e, Generator & gen, int n) |
| { |
| return experiment_generator_t<Experiment, Generator>(e, gen, n); |
| } |
| |
| |
| template<class Experiment, class Generator, class Distribution> |
| class ks_experiment_generator_t |
| { |
| public: |
| ks_experiment_generator_t(const Experiment & exper, Generator & gen, |
| const Distribution & distrib) |
| : experiment(exper), generator(gen), distribution(distrib) { } |
| double operator()() const { return experiment.run(generator, distribution); } |
| private: |
| const Experiment & experiment; |
| Generator & generator; |
| Distribution distribution; |
| }; |
| |
| template<class Experiment, class Generator, class Distribution> |
| ks_experiment_generator_t<Experiment, Generator, Distribution> |
| ks_experiment_generator(const Experiment & e, Generator & gen, |
| const Distribution & distrib) |
| { |
| return ks_experiment_generator_t<Experiment, Generator, Distribution> |
| (e, gen, distrib); |
| } |
| |
| |
| #endif /* STATISTIC_TESTS_HPP */ |
| |