source: trunk/test/statistics.cc @ 4053

Last change on this file since 4053 was 4053, checked in by Peter, 8 months ago

add some test for correlation(Matrix); speed up, particuarly when #rows is much greater than #columns. Avoid copying and modifying the whole matrix, instead calculate simple sums and squared sums using BLAS functionality as much as possible.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 13.4 KB
Line 
1// $Id: statistics.cc 4053 2021-03-26 03:29:42Z peter $
2
3/*
4  Copyright (C) 2004 Jari Häkkinen, Peter Johansson
5  Copyright (C) 2005 Peter Johansson
6  Copyright (C) 2006 Jari Häkkinen, Peter Johansson, Markus Ringnér
7  Copyright (C) 2007, 2008, 2009 Jari Häkkinen, Peter Johansson
8  Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016, 2018, 2020 Peter Johansson
9
10  This file is part of the yat library, http://dev.thep.lu.se/yat
11
12  The yat library is free software; you can redistribute it and/or
13  modify it under the terms of the GNU General Public License as
14  published by the Free Software Foundation; either version 3 of the
15  License, or (at your option) any later version.
16
17  The yat library is distributed in the hope that it will be useful,
18  but WITHOUT ANY WARRANTY; without even the implied warranty of
19  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20  General Public License for more details.
21
22  You should have received a copy of the GNU General Public License
23  along with yat. If not, see <http://www.gnu.org/licenses/>.
24*/
25
26#include <config.h>
27
28#include "Suite.h"
29
30#include "yat/classifier/Target.h"
31#include "yat/statistics/Average.h"
32#include "yat/statistics/utility.h"
33#include "yat/statistics/tTest.h"
34#include "yat/utility/DataWeight.h"
35#include "yat/utility/Matrix.h"
36#include "yat/utility/MatrixWeighted.h"
37#include "yat/utility/Vector.h"
38
39#include <boost/concept_archetype.hpp>
40#include <boost/iterator/iterator_archetypes.hpp>
41
42#include <cmath>
43#include <cstdlib>
44#include <iostream>
45#include <limits>
46#include <map>
47#include <vector>
48
49using namespace theplu::yat;
50void test_benjamini_hochberg(test::Suite&);
51void test_benjamini_hochberg_unsorted(test::Suite&);
52void test_correlation(test::Suite& suite);
53void test_entropy(test::Suite&);
54void test_mad(test::Suite&);
55void test_mutual_information(test::Suite&);
56
57void test_median_empty(test::Suite&);
58void test_percentiler(test::Suite&);
59void test_percentiler_nan(test::Suite&);
60
61template<typename RandomAccessIterator>
62void test_percentiler(test::Suite&, RandomAccessIterator,
63                      RandomAccessIterator,
64                      double p, double correct);
65
66template<typename RandomAccessIterator1, typename RandomAccessIterator2>
67void cmp_percentiler(test::Suite&,
68                     RandomAccessIterator1,
69                     RandomAccessIterator1,
70                     RandomAccessIterator2,
71                     RandomAccessIterator2);
72
73int main(int argc, char* argv[])
74{
75  test::Suite suite(argc, argv);
76
77  utility::Vector gsl_vec(10);
78  std::vector<double> data;
79  for (unsigned int i=0; i<10; i++){
80    data.push_back(static_cast<double>(i));
81    gsl_vec(i)=i;
82  }
83
84  double m=statistics::median(data.begin(), data.end());
85  double m_gsl=statistics::median(gsl_vec.begin(), gsl_vec.end());
86  if (m!=4.5 || m!=m_gsl)
87    suite.add(false);
88  if (false) {
89    using statistics::median;
90    typedef boost::iterator_archetypes::readable_lvalue_iterator_t Access;
91    typedef boost::random_access_traversal_tag Traversal;
92    boost::iterator_archetype<double, Access, Traversal> input;
93    double x = median(input, input);
94    boost::iterator_archetype<utility::DataWeight, Access, Traversal> input2;
95    x = median(input2, input2);
96    test::avoid_compiler_warning(x);
97  }
98  statistics::percentile2(data.begin(), data.end(), 100);
99  data.resize(1);
100  statistics::median(data.begin(), data.end());
101  // testing percentile2
102  test_percentiler(suite);
103
104  // test weighted percentiler with NaNs
105  test_percentiler_nan(suite);
106
107  double skewness_gsl=statistics::skewness(gsl_vec);
108  if (!suite.equal(1-skewness_gsl, 1.0) )
109    suite.add(false);
110  double kurtosis_gsl=statistics::kurtosis(gsl_vec);
111  suite.add(suite.equal_fix(kurtosis_gsl,-1.5616363636363637113,1e-10));
112  statistics::Average func;
113  suite.add(suite.equal(func(gsl_vec.begin(), gsl_vec.end()),4.5));
114  // easiest way to get a weighted iterator
115  classifier::MatrixLookupWeighted mlw(10,20,2.0, 1.0);
116  suite.add(suite.equal(func(mlw.begin(), mlw.end()),2.0));
117  // do not run compiler test
118  if (false) {
119    statistics::Average average;
120    double x = average(boost::input_iterator_archetype<double>(),
121                       boost::input_iterator_archetype<double>());
122    test::avoid_compiler_warning(x);
123    using utility::DataWeight;
124    x = average(boost::input_iterator_archetype_no_proxy<DataWeight>(),
125                boost::input_iterator_archetype_no_proxy<DataWeight>());
126    test::avoid_compiler_warning(x);
127  }
128
129  test_mad(suite);
130
131  // do not run compiler test
132  if (false) {
133    statistics::tTest t_test;
134    classifier::Target target;
135    add(t_test, boost::forward_iterator_archetype<double>(),
136        boost::forward_iterator_archetype<double>(), target);
137    add(t_test, boost::forward_iterator_archetype<utility::DataWeight>(),
138        boost::forward_iterator_archetype<utility::DataWeight>(), target);
139  }
140  test_benjamini_hochberg(suite);
141  test_benjamini_hochberg_unsorted(suite);
142  test_entropy(suite);
143  test_median_empty(suite);
144  test_mutual_information(suite);
145  test_correlation(suite);
146  return suite.return_value();
147}
148
149
150void test_correlation(test::Suite& suite)
151{
152  utility::Matrix x(100,10);
153  for (size_t i=0; i<x.rows(); ++i)
154    for (size_t j=0; j<x.columns(); ++j)
155      x(i, j) = sin(5*i + j);
156  utility::Matrix C = statistics::correlation(x);
157  if (!suite.equal(C(0,0), 1.0)) {
158    suite.add(false);
159    suite.err() << "correlation element(0, 0) not unity\n";
160  }
161
162  for (size_t i=0; i<C.rows(); ++i)
163    for (size_t j=0; j<C.columns(); ++j) {
164      statistics::AveragerPair ap;
165      add(ap, x.begin_column(i), x.end_column(i), x.begin_column(j));
166      if (!suite.equal(C(i, j), ap.correlation(), 10)) {
167        suite.add(false);
168        suite.err() << "error: C(" << i << ", " << j << ") is " << C(i,j)
169                    << "; expected " << ap.correlation() << "\n";
170      }
171    }
172
173}
174
175
176void test_benjamini_hochberg(test::Suite& suite)
177{
178  std::vector<double> p;
179  p.push_back(0.0001);
180  p.push_back(0.01);
181  p.push_back(0.015);
182  p.push_back(0.5);
183  p.push_back(0.99);
184  std::vector<double> q(p.size());
185  statistics::benjamini_hochberg(p.begin(), p.end(), q.begin());
186  suite.add(suite.equal(q[0], p[0]*5));
187  suite.add(suite.equal(q[1], p[1]*2.5));
188  suite.add(suite.equal(q[2], 0.025));
189  suite.add(suite.equal(q[3], p[3]*1.25));
190  suite.add(suite.equal(q[4], 0.99));
191
192  // do nut run compiler test
193  if (false) {
194    using statistics::benjamini_hochberg;
195
196    typedef double Value;
197    typedef boost::iterator_archetypes::readable_iterator_t Access;
198    typedef boost::bidirectional_traversal_tag Traversal;
199    typedef boost::iterator_archetypes::readable_writable_iterator_t Access2;
200    typedef boost::bidirectional_traversal_tag Traversal2;
201
202    boost::iterator_archetype<Value, Access, Traversal> iterator;
203    boost::iterator_archetype<Value, Access2, Traversal2> iterator2;
204
205    benjamini_hochberg(iterator, iterator, iterator2);
206  }
207}
208
209
210void test_benjamini_hochberg_unsorted(test::Suite& suite)
211{
212  std::vector<double> p;
213  p.push_back(0.015);
214  p.push_back(0.0001);
215  p.push_back(0.01);
216  p.push_back(0.5);
217  p.push_back(0.99);
218  std::vector<double> q(p.size());
219  statistics::benjamini_hochberg_unsorted(p.begin(), p.end(), q.begin());
220  suite.add(suite.equal(q[1], p[1]*5));
221  suite.add(suite.equal(q[2], p[2]*2.5));
222  suite.add(suite.equal(q[0], 0.025));
223  suite.add(suite.equal(q[3], p[3]*1.25));
224  suite.add(suite.equal(q[4], 0.99));
225
226  // do nut run compiler test
227  if (false) {
228    using statistics::benjamini_hochberg_unsorted;
229
230    typedef double Value;
231    typedef boost::iterator_archetypes::readable_iterator_t Access;
232    typedef boost::random_access_traversal_tag Traversal;
233    typedef boost::iterator_archetypes::readable_writable_iterator_t Access2;
234    typedef boost::random_access_traversal_tag Traversal2;
235
236    boost::iterator_archetype<Value, Access, Traversal> input;
237    boost::iterator_archetype<Value, Access2, Traversal2> result;
238
239    benjamini_hochberg_unsorted(input, input, result);
240  }
241}
242
243
244void test_entropy(test::Suite& suite)
245{
246  suite.out() << "testing entropy(2)\n";
247  using statistics::entropy;
248  std::vector<int> x(10000,0);
249  x[512] = 42;
250  double e = entropy(x.begin(), x.end());
251  if (e>1e-15) {
252    suite.add(false);
253    suite.out() << "entropy: " << e << " expected close to 0\n";
254  }
255  x[0] = 42;
256  e = entropy(x.begin(), x.end());
257  if (e<=0) {
258    suite.add(false);
259    suite.out() << "entropy: " << e << " expected > 0\n";
260  }
261
262  // do not run compiler test
263  if (false) {
264    entropy(boost::input_iterator_archetype<double>(),
265            boost::input_iterator_archetype<double>());
266  }
267}
268
269
270void test_mad(test::Suite& suite)
271{
272  suite.err() << "testing mad" << std::endl;
273  utility::Vector x(3);
274  x(0) = 3;
275  x(1) = 1;
276  x(2) = 100;
277  suite.add(suite.equal(statistics::mad(x.begin(), x.end()), 2));
278
279  std::vector<utility::DataWeight> wx(3);
280  wx[0] = utility::DataWeight(3, 0.4);
281  wx[1] = utility::DataWeight(1, 0.4);
282  wx[2] = utility::DataWeight(100, 0.6);
283  suite.add(suite.equal(statistics::mad(wx.begin(), wx.end()), 2));
284  // do not run compiler test
285  if (false) {
286    using statistics::mad;
287    typedef boost::iterator_archetypes::readable_lvalue_iterator_t Access;
288    typedef boost::random_access_traversal_tag Traversal;
289    boost::iterator_archetype<double, Access, Traversal> input;
290    double x = mad(input, input);
291    typedef boost::iterator_archetypes::readable_iterator_t Access2;
292    boost::iterator_archetype<utility::DataWeight, Access2, Traversal> input2;
293    x = mad(input2, input2);
294    test::avoid_compiler_warning(x);
295  }
296}
297
298
299void test_mutual_information(test::Suite& suite)
300{
301  suite.out() << "testing mutual_information\n";
302  using statistics::mutual_information;
303  utility::Matrix x(2,2);
304  x(0,0) = 100;
305  x(1,1) = 100;
306  double mi = mutual_information(x);
307  if (!suite.add(suite.equal(mi,1.0,100))) {
308    suite.err() << "error: mutual information: " << mi << "\n";
309  }
310
311  // testing a non-square Matrix
312  x.resize(3,4,0);
313  x(0,0) = 1;
314  x(1,1) = 1;
315  x(2,2) = 1;
316  x(2,3) = 1;
317  mi = mutual_information(x);
318  suite.out() << "mi: " << mi << "\n";
319}
320
321
322// test for ticket #660
323void test_median_empty(test::Suite& suite)
324{
325  std::vector<double> x;
326  double m = 0;
327  m = statistics::median(x.begin(), x.end(), true);
328  test::avoid_compiler_warning(m);
329}
330
331
332void test_percentiler(test::Suite& suite)
333{
334  suite.err() << "testing unweighted percentile2" << std::endl;
335  std::vector<double> x;
336  x.reserve(6);
337  for (unsigned int i=0; i<5; i++){
338    x.push_back(static_cast<double>(i+1));
339  }
340  test_percentiler(suite, x.begin(), x.end(), 50, 3);
341  x.push_back(6);
342  test_percentiler(suite, x.begin(), x.end(), 50, 3.5);
343  test_percentiler(suite, x.begin(), x.end(), 25, 2);
344  test_percentiler(suite, x.begin(), x.end(), 0, 1);
345  test_percentiler(suite, x.begin(), x.end(), 10, 1);
346
347  suite.err() << "testing duplication of data\n";
348  std::vector<double> x2(x);
349  for (size_t i=0; i<x.size(); ++i)
350    x2.push_back(x[i]);
351  cmp_percentiler(suite, x.begin(), x.end(), x2.begin(), x2.end());
352
353
354  // testing weighted
355
356  suite.err() << "testing weighted percentile2" << std::endl;
357  std::vector<utility::DataWeight> xw(x.size());
358  for (size_t i=0; i<xw.size(); ++i) {
359    xw[i].data() = x[i];
360    xw[i].weight() = 1.0;
361  }
362  const std::vector<utility::DataWeight> xw_orig(xw);
363  suite.err() << "testing weighted" << std::endl;
364  test_percentiler(suite, xw.begin(), xw.end(), 0, 1);
365  test_percentiler(suite, xw.begin(), xw.end(), 100, 6);
366  test_percentiler(suite, xw.begin(), xw.end(), 49, 3);
367  test_percentiler(suite, xw.begin(), xw.end(), 51, 4);
368  test_percentiler(suite, xw.begin(), xw.end(), 50, 3.5);
369  test_percentiler(suite, x.begin(), x.end(), 10, 1);
370
371  suite.err() << "testing weighted with unity weights" << std::endl;
372  cmp_percentiler(suite, x.begin(), x.end(), xw.begin(), xw.end());
373
374  suite.err() << "testing that w=0 equals removed data point\n";
375  xw=xw_orig;
376  std::vector<utility::DataWeight> xw2(xw_orig);
377  xw[3].weight() = 0.0;
378  xw2.erase(xw2.begin()+3);
379  cmp_percentiler(suite, xw.begin(), xw.end(), xw2.begin(), xw2.end());
380
381  suite.err() << "testing rescaling of weights\n";
382  xw2 = xw;
383  for (size_t i=0; i<xw2.size(); ++i)
384    xw2[i].weight()*=2;
385  cmp_percentiler(suite, xw.begin(), xw.end(), xw2.begin(), xw2.end());
386
387  // do not run compiler test
388  if (false) {
389    statistics::Percentiler percentiler(50);
390    typedef boost::iterator_archetypes::readable_lvalue_iterator_t Access;
391    typedef boost::random_access_traversal_tag Traversal;
392    boost::iterator_archetype<double, Access, Traversal> input;
393    double x = percentiler(input, input);
394    boost::iterator_archetype<utility::DataWeight, Access, Traversal> input2;
395    x = percentiler(input2, input2);
396    test::avoid_compiler_warning(x);
397  }
398}
399
400void test_percentiler_nan(test::Suite& suite)
401{
402  using utility::DataWeight;
403  std::vector<double> v;
404  v.push_back(1);
405  v.push_back(10);
406  v.push_back(4);
407  v.push_back(2);
408  std::vector<DataWeight> wv(5);
409  wv[0] = DataWeight(v[0]);
410  wv[1] = DataWeight(v[1]);
411  wv[2] = DataWeight(std::numeric_limits<double>::quiet_NaN(), 0.0);
412  wv[3] = DataWeight(v[2]);
413  wv[4] = DataWeight(v[3]);
414
415  cmp_percentiler(suite, v.begin(), v.end(), wv.begin(), wv.end());
416}
417
418template<typename RandomAccessIterator>
419void test_percentiler(test::Suite& suite, 
420                      RandomAccessIterator first, 
421                      RandomAccessIterator last,
422                      double p, double correct)
423{
424  using statistics::percentile2;
425  double x = percentile2(first, last, p);
426  if (!suite.add(suite.equal(x, correct, 10))) {
427    suite.err() << "Error in percentile2 for " << p << "th percentile \n";
428    suite.err() << "  calculated value: " << x << "\n";
429    suite.err() << "  expected value: " << correct << "\n";
430  }
431}
432
433template<typename RandomAccessIterator1, typename RandomAccessIterator2>
434void cmp_percentiler(test::Suite& suite, 
435                     RandomAccessIterator1 first1, 
436                     RandomAccessIterator1 last1,
437                     RandomAccessIterator2 first2,
438                     RandomAccessIterator2 last2)
439{
440  for (double p=0; p<=100; p+=10) {
441    double correct=statistics::percentile2(first1, last1, p);
442    test_percentiler(suite, first2, last2, p, correct);
443  }
444
445}
Note: See TracBrowser for help on using the repository browser.