source: trunk/yat/statistics/utility.h @ 4053

Last change on this file since 4053 was 4053, checked in by Peter, 8 months ago

add some test for correlation(Matrix); speed up, particuarly when #rows is much greater than #columns. Avoid copying and modifying the whole matrix, instead calculate simple sums and squared sums using BLAS functionality as much as possible.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 17.0 KB
Line 
1#ifndef _theplu_yat_statistics_utility_
2#define _theplu_yat_statistics_utility_
3
4// $Id: utility.h 4053 2021-03-26 03:29:42Z peter $
5
6/*
7  Copyright (C) 2004 Jari Häkkinen, Peter Johansson
8  Copyright (C) 2005 Peter Johansson
9  Copyright (C) 2006 Jari Häkkinen, Peter Johansson, Markus Ringnér
10  Copyright (C) 2007, 2008 Jari Häkkinen, Peter Johansson
11  Copyright (C) 2009, 2010, 2011, 2013, 2014, 2016, 2018, 2020 Peter Johansson
12
13  This file is part of the yat library, http://dev.thep.lu.se/yat
14
15  The yat library is free software; you can redistribute it and/or
16  modify it under the terms of the GNU General Public License as
17  published by the Free Software Foundation; either version 3 of the
18  License, or (at your option) any later version.
19
20  The yat library is distributed in the hope that it will be useful,
21  but WITHOUT ANY WARRANTY; without even the implied warranty of
22  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23  General Public License for more details.
24
25  You should have received a copy of the GNU General Public License
26  along with yat. If not, see <http://www.gnu.org/licenses/>.
27*/
28
29#include "Percentiler.h"
30
31#include "yat/classifier/DataLookupWeighted1D.h"
32#include "yat/classifier/Target.h"
33#include "yat/normalizer/utility.h"
34#include "yat/utility/concept_check.h"
35#include "yat/utility/DataIterator.h"
36#include "yat/utility/deprecate.h"
37#include "yat/utility/iterator_traits.h"
38#include "yat/utility/Matrix.h"
39#include "yat/utility/sort_index.h"
40#include "yat/utility/Vector.h"
41#include "yat/utility/VectorBase.h"
42#include "yat/utility/yat_assert.h"
43
44#include <boost/concept_check.hpp>
45#include <boost/iterator/iterator_concepts.hpp>
46#include <boost/iterator/iterator_traits.hpp>
47#include <boost/iterator/permutation_iterator.hpp>
48
49#include <gsl/gsl_math.h>
50#include <gsl/gsl_statistics_double.h>
51
52#include <algorithm>
53#include <cmath>
54#include <iterator>
55#include <stdexcept>
56#include <vector>
57
58namespace theplu {
59namespace yat {
60namespace statistics {
61
62  /**
63     Adding a range [\a first, \a last) into an object of type T. The
64     requirements for the type T is to have an add(double, bool, double)
65     function.
66
67     Type Requirements:
68     - \c ForwardIterator is a \readable_iterator
69     - \c ForwardIterator is \single_pass_iterator
70  */
71  template <typename T, typename ForwardIterator>
72  void add(T& o, ForwardIterator first, ForwardIterator last,
73           const classifier::Target& target);
74
75  /**
76     \brief Benjamini Hochberg multiple test correction
77
78     Given a sorted range of p-values such that
79     \f$ p_1 \le p_2 \le ... \le p_N \f$ a Benjamnini-Hochberg corrected
80     p-value, \c q, is calculated recursively as
81     \f$ q_i = \f$ min \f$(p_i \frac{N}{i}, q_{i+1})\f$ with the anchor
82     constraint that \f$ q_m = p_m \f$.
83
84     Type Requirements:
85     - \c BidirectionalIterator1 is a \readable_iterator
86     - \c BidirectionalIterator1 is a \bidirectional_traversal_iterator
87     - \c BidirectionalIterator1::value type is convertible to \c double
88     - \c BidirectionalIterator2 is a \readable_iterator
89     - \c BidirectionalIterator2 is a \writable_iterator
90     - \c BidirectionalIterator2 is a \bidirectional_traversal_iterator
91     - \c double is convertible to \c BidirectionalIterator2::value_type
92
93     \since New in yat 0.8
94   */
95  template<typename BidirectionalIterator1, typename BidirectionalIterator2>
96  void benjamini_hochberg(BidirectionalIterator1 first,
97                          BidirectionalIterator1 last,
98                          BidirectionalIterator2 result);
99
100
101  /**
102     \brief Benjamini Hochberg multiple test correction
103
104     Similar to benjamini_hochberg() but does not assume that input
105     range, [first, last), is sorted. The resulting range is the same
106     as if sorting input range, call benjamini_hochberg, and unsort
107     the result range.
108
109     Type Requirements:
110     - \c RandomAccessIterator is a \readable_iterator
111     - \c RandomAccessIterator is a \random_access_traversal_iterator
112     - \c RandomAccessIterator::value type is convertible to \c double
113     - \c MutableRandomAccessIterator is a \readable_iterator
114     - \c MutableRandomAccessIterator is a \writable_iterator
115     - \c MutableRandomAccessIterator is a \random_access_traversal_iterator
116     - \c double is convertible to \c MutableRandomAccessIterator::value_type
117
118     \since New in yat 0.13
119   */
120  template<typename RandomAccessIterator, typename MutableRandomAccessIterator>
121  void benjamini_hochberg_unsorted(RandomAccessIterator first,
122                                   RandomAccessIterator last,
123                                   MutableRandomAccessIterator result);
124
125
126  ///
127  /// Calculates the probability to get \a k or smaller from a
128  /// hypergeometric distribution with parameters \a n1 \a n2 \a
129  /// t. Hypergeomtric situation you get in the following situation:
130  /// Let there be \a n1 ways for a "good" selection and \a n2 ways
131  /// for a "bad" selection out of a total of possibilities. Take \a
132  /// t samples without replacement and \a k of those are "good"
133  /// samples. \a k will follow a hypergeomtric distribution.
134  ///
135  /// @return cumulative hypergeomtric distribution functions P(k).
136  ///
137  double cdf_hypergeometric_P(unsigned int k, unsigned int n1,
138                              unsigned int n2, unsigned int t);
139
140  /**
141     Calculates the Pearson correlation between each pair of columns.
142
143     \return a symmetric matrix with correlations
144
145     \since New in yat 0.16
146   */
147  utility::Matrix correlation(const utility::Matrix& x);
148
149
150  /**
151     The entropy is calculated as \f$ - \sum_i p_i \log p_i \f$ where
152     \f$p_i = \frac{n_i}{\sum_j n_j} \f$
153
154     Requirements:
155     - \c InputIterator must be a \readable_iterator
156     - \c InputIterator must be a \single_pass_iterator
157     - \c InputIterator::value_type must be convertible to \c double
158
159     \since New in yat 0.12
160   */
161  template<typename InputIterator>
162  double entropy(InputIterator first, InputIterator last);
163
164  /**
165     \brief one-sided p-value
166
167     This function uses the t-distribution to calculate the one-sided
168     p-value. Given that the true correlation is zero (Null
169     hypothesis) the estimated correlation, r, after a transformation
170     is t-distributed:
171
172     \f$ \sqrt{(n-2)} \frac{r}{\sqrt{(1-r^2)}} \in t(n-2) \f$
173
174     \return Probability that correlation is larger than \a r by
175     chance when having \a n samples. For \a r larger or equal to 1.0,
176     0.0 is returned. For \a r smaller or equal to -1.0, 1.0 is
177     returned.
178   */
179  double pearson_p_value(double r, unsigned int n);
180
181  ///
182  /// @brief Computes the kurtosis of the data in a vector.
183  ///
184  /// The kurtosis measures how sharply peaked a distribution is,
185  /// relative to its width. The kurtosis is normalized to zero for a
186  /// gaussian distribution.
187  ///
188  double kurtosis(const utility::VectorBase&);
189
190
191  /**
192     \brief Median absolute deviation from median
193
194     Function is non-mutable function
195
196     Type Requirements:
197     - \c RandomAccessIterator must be a \ref
198     concept_data_iterator
199     - \c RandomAccessIterator must be a \random_access_traversal_iterator
200     - \c For unweighted iterator, RandomAccessIterator must be an
201       \lvalue_iterator as implied by \forward_iterator.
202
203     Since 0.6 function also work with a \ref
204     concept_weighted_iterator
205  */
206  template <class RandomAccessIterator>
207  double mad(RandomAccessIterator first, RandomAccessIterator last,
208             bool sorted=false);
209
210
211  /**
212     Median is defined to be value in the middle. If number of values
213     is even median is the average of the two middle values.  the
214     median value is given by p equal to 50. If \a sorted is false
215     (default), the range is copied, the copy is sorted, and then
216     used to calculate the median.
217
218     Function is a non-mutable function, i.e., \a first and \a last
219     can be const_iterators.
220
221     Type Requirements:
222     - \c RandomAccessIterator must be a \ref
223     concept_data_iterator
224     - \c RandomAccessIterator must be a \random_access_traversal_iterator
225     - \c For unweighted iterator, RandomAccessIterator must be an
226       \lvalue_iterator as implied by \forward_iterator.
227
228     \return median of range
229  */
230  template <class RandomAccessIterator>
231  double median(RandomAccessIterator first, RandomAccessIterator last,
232                bool sorted=false);
233
234
235  /**
236     \brief Calculates the mutual information of \a A.
237
238     The elements in A are unnormalized probabilies of the joint
239     distribution.
240
241     The mutual information is calculated as \f$ \sum \sum p(x,y) \log_2
242     \frac {p(x,y)} {p(x)p(y)} \f$ where
243     \f$ p(x,y) = \frac {A_{xy}}{\sum_{x,y} A_{xy}} \f$;
244     \f$ p(x) = \sum_y A_{xy} / \sum_{x,y} A_{xy} \f$;
245     \f$ p(y) = \sum_x A_{xy} / \sum_{x,y} A_{xy} \f$
246
247     Requirements:
248     - \c T must be a model of \ref concept_container_2d
249     - \c T::value_type must be convertible to \c double
250
251     \return mutual information in bits; if you want in natural base
252     multiply with \c M_LN2 (defined in \c gsl/gsl_math.h )
253
254     \since New in yat 0.12
255   */
256  template<class T>
257  double mutual_information(const T& A);
258
259  /**
260     The percentile is determined by the \a p, a number between 0 and
261     100. The percentile is found by interpolation, using the formula
262     \f$ percentile = (1 - \delta) x_i + \delta x_{i+1} \f$ where \a
263     p is floor\f$((n - 1)p/100)\f$ and \f$ \delta \f$ is \f$
264     (n-1)p/100 - i \f$.Thus the minimum value of the vector is given
265     by p equal to zero, the maximum is given by p equal to 100 and
266     the median value is given by p equal to 50. If @a sorted
267     is false (default), the vector is copied, the copy is sorted,
268     and then used to calculate the median.
269
270     Function is a non-mutable function, i.e., \a first and \a last
271     can be const_iterators.
272
273     Requirements: RandomAccessIterator is an iterator over a range of
274     doubles (or any type being convertable to double).
275
276     @return \a p'th percentile of range
277
278     \deprecated percentile2 will replace this function in the future
279
280     \note the definition of percentile used here is not identical to
281     that one used in percentile2 and Percentile. The difference is
282     smaller for large ranges.
283  */
284  template <class RandomAccessIterator>
285  double percentile(RandomAccessIterator first, RandomAccessIterator last,
286                    double p, bool sorted=false) YAT_DEPRECATE;
287
288  /**
289     \see Percentiler
290
291     Type Requirements:
292     - \c RandomAccessIterator must be a \ref
293     concept_data_iterator
294     - \c RandomAccessIterator must be a \random_access_traversal_iterator
295     - \c For unweighted iterator, RandomAccessIterator must be an
296       \lvalue_iterator as implied by \forward_iterator.
297
298     \since new in yat 0.5
299   */
300  template <class RandomAccessIterator>
301  double percentile2(RandomAccessIterator first, RandomAccessIterator last,
302                     double p, bool sorted=false)
303  {
304    Percentiler percentiler(p, sorted);
305    return percentiler(first, last);
306  }
307
308  ///
309  /// @brief Computes the skewness of the data in a vector.
310  ///
311  /// The skewness measures the asymmetry of the tails of a
312  /// distribution.
313  ///
314  double skewness(const utility::VectorBase&);
315
316
317  //////// template implementations ///////////
318
319  template <typename T, typename ForwardIterator>
320  void add(T& o, ForwardIterator first, ForwardIterator last,
321           const classifier::Target& target)
322  {
323    BOOST_CONCEPT_ASSERT((boost_concepts::ReadableIterator<ForwardIterator>));
324    BOOST_CONCEPT_ASSERT((boost_concepts::SinglePassIterator<ForwardIterator>));
325    utility::iterator_traits<ForwardIterator> traits;
326    for (size_t i=0; first!=last; ++i, ++first)
327      o.add(traits.data(first), target.binary(i), traits.weight(first));
328  }
329
330
331  template<typename BidirectionalIterator1, typename BidirectionalIterator2>
332  void benjamini_hochberg(BidirectionalIterator1 first,
333                          BidirectionalIterator1 last,
334                          BidirectionalIterator2 result)
335  {
336    BOOST_CONCEPT_ASSERT((boost_concepts::ReadableIterator<BidirectionalIterator1>));
337    BOOST_CONCEPT_ASSERT((boost_concepts::BidirectionalTraversal<BidirectionalIterator1>));
338
339    BOOST_CONCEPT_ASSERT((boost_concepts::ReadableIterator<BidirectionalIterator2>));
340    BOOST_CONCEPT_ASSERT((boost_concepts::WritableIterator<BidirectionalIterator2>));
341    BOOST_CONCEPT_ASSERT((boost_concepts::BidirectionalTraversal<BidirectionalIterator2>));
342
343    typedef typename boost::iterator_difference<BidirectionalIterator1>::type
344      diff_type;
345
346    diff_type n = std::distance(first, last);
347    if (!n)
348      return;
349    std::advance(result, n-1);
350    first = last;
351    --first;
352    diff_type rank = n;
353
354    double x = 1.0;
355    while (rank) {
356      x = std::min(*first * n/static_cast<double>(rank), x);
357      *result = x;
358      --rank;
359      --first;
360      --result;
361    }
362  }
363
364
365  template<typename RandomAccessIterator, typename MutableRandomAccessIterator>
366  void benjamini_hochberg_unsorted(RandomAccessIterator first,
367                                   RandomAccessIterator last,
368                                   MutableRandomAccessIterator result)
369  {
370    BOOST_CONCEPT_ASSERT((boost_concepts::ReadableIterator<RandomAccessIterator>));
371    BOOST_CONCEPT_ASSERT((boost_concepts::RandomAccessTraversal<RandomAccessIterator>));
372
373    BOOST_CONCEPT_ASSERT((boost_concepts::ReadableIterator<MutableRandomAccessIterator>));
374    BOOST_CONCEPT_ASSERT((boost_concepts::WritableIterator<MutableRandomAccessIterator>));
375    BOOST_CONCEPT_ASSERT((boost_concepts::RandomAccessTraversal<MutableRandomAccessIterator>));
376
377    std::vector<size_t> idx;
378    utility::sort_index(first, last, idx);
379    benjamini_hochberg(boost::make_permutation_iterator(first, idx.begin()),
380                       boost::make_permutation_iterator(first, idx.end()),
381                       boost::make_permutation_iterator(result, idx.begin()));
382  }
383
384
385  template<typename InputIterator>
386  double entropy(InputIterator first, InputIterator last)
387  {
388    BOOST_CONCEPT_ASSERT((boost_concepts::ReadableIterator<InputIterator>));
389    BOOST_CONCEPT_ASSERT((boost_concepts::SinglePassIterator<InputIterator>));
390    using boost::Convertible;
391    typedef typename boost::iterator_value<InputIterator>::type T;
392    BOOST_CONCEPT_ASSERT((Convertible<T,double>));
393    double sum = 0;
394    double N = 0;
395    for (; first != last; ++first) {
396      if (*first) {
397        N += *first;
398        sum += *first * std::log(static_cast<double>(*first));
399      }
400    }
401    return -sum / N + log(N);
402  }
403
404
405  template <class RandomAccessIterator>
406  double mad(RandomAccessIterator first, RandomAccessIterator last,
407             bool sorted)
408  {
409    BOOST_CONCEPT_ASSERT((utility::DataIteratorConcept<RandomAccessIterator>));
410    BOOST_CONCEPT_ASSERT((boost_concepts::RandomAccessTraversal<RandomAccessIterator>));
411    double m = median(first, last, sorted);
412    typedef typename boost::iterator_value<RandomAccessIterator>::type T;
413    std::vector<T> ad(std::distance(first, last));
414    // copy weights if needed
415    normalizer::detail::copy_weight_if_weighted(first, last, ad.begin());
416    // assign data (absolute deviation from median)
417    utility::iterator_traits<RandomAccessIterator> traits;
418    utility::DataIterator<typename std::vector<T>::iterator> first2(ad.begin());
419    while (first!=last) {
420      *first2 = std::abs(traits.data(first)-m);
421      ++first;
422      ++first2;
423    }
424    return median(ad.begin(), ad.end(), false);
425  }
426
427
428  template <class RandomAccessIterator>
429  double median(RandomAccessIterator first, RandomAccessIterator last,
430                bool sorted)
431  {
432    BOOST_CONCEPT_ASSERT((utility::DataIteratorConcept<RandomAccessIterator>));
433    BOOST_CONCEPT_ASSERT((boost_concepts::RandomAccessTraversal<RandomAccessIterator>));
434    return percentile2(first, last, 50.0, sorted);
435  }
436
437
438  template<class T>
439  double mutual_information(const T& n)
440  {
441    BOOST_CONCEPT_ASSERT((utility::Container2D<T>));
442    using boost::Convertible;
443    BOOST_CONCEPT_ASSERT((Convertible<typename T::value_type,double>));
444
445    // p_x = \sum_y p_xy
446
447    // Mutual Information is defined as
448    // \sum_xy p_xy * log (p_xy / (p_x p_y)) =
449    // \sum_xy p_xy * [log p_xy - log p_x - log p_y]
450    // \sum_xy p_xy log p_xy - p_xy log p_x - p_xy log p_y
451    // \sum_xy p_xy log p_xy - \sum_x p_x log p_x - \sum_y p_y log p_y
452    // - entropy_xy + entropy_x + entropy_y
453
454    utility::Vector rowsum(n.columns(), 0);
455    for (size_t c = 0; c<n.columns(); ++c)
456      rowsum(c) = std::accumulate(n.begin_column(c), n.end_column(c), 0);
457
458    utility::Vector colsum(n.rows(), 0);
459    for (size_t r = 0; r<n.rows(); ++r)
460      colsum(r) = std::accumulate(n.begin_row(r), n.end_row(r), 0);
461
462    double mi  = - entropy(n.begin(), n.end());
463    mi += entropy(rowsum.begin(), rowsum.end());
464    mi += entropy(colsum.begin(), colsum.end());
465
466    return mi/M_LN2;
467  }
468
469
470  template <class RandomAccessIterator>
471  double percentile(RandomAccessIterator first, RandomAccessIterator last,
472                    double p, bool sorted)
473  {
474    BOOST_CONCEPT_ASSERT((boost::RandomAccessIterator<RandomAccessIterator>));
475    BOOST_CONCEPT_ASSERT((utility::DataIteratorConcept<RandomAccessIterator>));
476    // range is one value only is a special case
477    if (first+1 == last)
478      return utility::iterator_traits<RandomAccessIterator>().data(first);
479    if (sorted) {
480      // have to take care of this special case
481      if (p>=100)
482        return utility::iterator_traits<RandomAccessIterator>().data(--last);
483      double j = p/100 * (std::distance(first,last)-1);
484      int i = static_cast<int>(j);
485      return (1-j+floor(j))*first[i] + (j-floor(j))*first[i+1];
486    }
487    using std::iterator_traits;
488    typedef typename iterator_traits<RandomAccessIterator>::value_type value_t;
489    std::vector<value_t> v_copy(first, last);
490    size_t i = static_cast<size_t>(p/100 * (v_copy.size()-1));
491    if (i+2 < v_copy.size()) {
492      std::partial_sort(v_copy.begin(), v_copy.begin()+i+2, v_copy.end());
493    }
494    else
495      std::sort(v_copy.begin(), v_copy.end());
496    return percentile(v_copy.begin(), v_copy.end(), p, true);
497  }
498
499
500}}} // of namespace statistics, yat, and theplu
501
502#endif
Note: See TracBrowser for help on using the repository browser.