source: trunk/yat/statistics/utility.h @ 3745

Last change on this file since 3745 was 3745, checked in by Peter, 4 years ago

add function calculating correlation matrix

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 16.7 KB
Line 
1#ifndef _theplu_yat_statistics_utility_
2#define _theplu_yat_statistics_utility_
3
4// $Id: utility.h 3745 2018-07-27 00:27:02Z peter $
5
6/*
7  Copyright (C) 2004 Jari Häkkinen, Peter Johansson
8  Copyright (C) 2005 Peter Johansson
9  Copyright (C) 2006 Jari Häkkinen, Peter Johansson, Markus Ringnér
10  Copyright (C) 2007, 2008 Jari Häkkinen, Peter Johansson
11  Copyright (C) 2009, 2010, 2011, 2013, 2014, 2016 Peter Johansson
12
13  This file is part of the yat library, http://dev.thep.lu.se/yat
14
15  The yat library is free software; you can redistribute it and/or
16  modify it under the terms of the GNU General Public License as
17  published by the Free Software Foundation; either version 3 of the
18  License, or (at your option) any later version.
19
20  The yat library is distributed in the hope that it will be useful,
21  but WITHOUT ANY WARRANTY; without even the implied warranty of
22  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23  General Public License for more details.
24
25  You should have received a copy of the GNU General Public License
26  along with yat. If not, see <http://www.gnu.org/licenses/>.
27*/
28
29#include "Percentiler.h"
30
31#include "yat/classifier/DataLookupWeighted1D.h"
32#include "yat/classifier/Target.h"
33#include "yat/normalizer/utility.h"
34#include "yat/utility/concept_check.h"
35#include "yat/utility/DataIterator.h"
36#include "yat/utility/deprecate.h"
37#include "yat/utility/iterator_traits.h"
38#include "yat/utility/Matrix.h"
39#include "yat/utility/sort_index.h"
40#include "yat/utility/Vector.h"
41#include "yat/utility/VectorBase.h"
42#include "yat/utility/yat_assert.h"
43
44#include <boost/concept_check.hpp>
45#include <boost/iterator/iterator_concepts.hpp>
46#include <boost/iterator/iterator_traits.hpp>
47#include <boost/iterator/permutation_iterator.hpp>
48
49#include <gsl/gsl_math.h>
50#include <gsl/gsl_statistics_double.h>
51
52#include <algorithm>
53#include <cmath>
54#include <iterator>
55#include <stdexcept>
56#include <vector>
57
58namespace theplu {
59namespace yat {
60namespace statistics {
61
62  /**
63     Adding a range [\a first, \a last) into an object of type T. The
64     requirements for the type T is to have an add(double, bool, double)
65     function.
66
67     Type Requirements:
68     - \c ForwardIterator is a \readable_iterator
69     - \c ForwardIterator is \single_pass_iterator
70  */
71  template <typename T, typename ForwardIterator>
72  void add(T& o, ForwardIterator first, ForwardIterator last,
73           const classifier::Target& target);
74
75  /**
76     \brief Benjamini Hochberg multiple test correction
77
78     Given a sorted range of p-values such that
79     \f$ p_1 \le p_2 \le ... \le p_N \f$ a Benjamnini-Hochberg corrected
80     p-value, \c q, is calculated recursively as
81     \f$ q_i = \f$ min \f$(p_i \frac{N}{i}, q_{i+1})\f$ with the anchor
82     constraint that \f$ q_m = p_m \f$.
83
84     Type Requirements:
85     - \c BidirectionalIterator1 is a \readable_iterator
86     - \c BidirectionalIterator1 is a \bidirectional_traversal_iterator
87     - \c BidirectionalIterator1::value type is convertible to \c double
88     - \c BidirectionalIterator2 is a \readable_iterator
89     - \c BidirectionalIterator2 is a \writable_iterator
90     - \c BidirectionalIterator2 is a \bidirectional_traversal_iterator
91     - \c double is convertible to \c BidirectionalIterator2::value_type
92
93     \since New in yat 0.8
94   */
95  template<typename BidirectionalIterator1, typename BidirectionalIterator2>
96  void benjamini_hochberg(BidirectionalIterator1 first,
97                          BidirectionalIterator1 last,
98                          BidirectionalIterator2 result);
99
100
101  /**
102     \brief Benjamini Hochberg multiple test correction
103
104     Similar to benjamini_hochberg() but does not assume that input
105     range, [first, last), is sorted. The resulting range is the same
106     as if sorting input range, call benjamini_hochberg, and unsort
107     the result range.
108
109     Type Requirements:
110     - \c RandomAccessIterator is a \readable_iterator
111     - \c RandomAccessIterator is a \random_access_traversal_iterator
112     - \c RandomAccessIterator::value type is convertible to \c double
113     - \c MutableRandomAccessIterator is a \readable_iterator
114     - \c MutableRandomAccessIterator is a \writable_iterator
115     - \c MutableRandomAccessIterator is a \random_access_traversal_iterator
116     - \c double is convertible to \c MutableRandomAccessIterator::value_type
117
118     \since New in yat 0.13
119   */
120  template<typename RandomAccessIterator, typename MutableRandomAccessIterator>
121  void benjamini_hochberg_unsorted(RandomAccessIterator first,
122                                   RandomAccessIterator last,
123                                   MutableRandomAccessIterator result);
124
125
126  ///
127  /// Calculates the probability to get \a k or smaller from a
128  /// hypergeometric distribution with parameters \a n1 \a n2 \a
129  /// t. Hypergeomtric situation you get in the following situation:
130  /// Let there be \a n1 ways for a "good" selection and \a n2 ways
131  /// for a "bad" selection out of a total of possibilities. Take \a
132  /// t samples without replacement and \a k of those are "good"
133  /// samples. \a k will follow a hypergeomtric distribution.
134  ///
135  /// @return cumulative hypergeomtric distribution functions P(k).
136  ///
137  double cdf_hypergeometric_P(unsigned int k, unsigned int n1,
138                              unsigned int n2, unsigned int t);
139
140  /**
141     Calculates the Pearson correlation between each pair of columns.
142
143     \return a symmetric matrix with correlations
144
145     \since New in yat 0.16
146   */
147  utility::Matrix correlation(utility::Matrix x);
148
149
150  /**
151     The entropy is calculated as \f$ - \sum_i p_i \log p_i \f$ where
152     \f$p_i = \frac{n_i}{\sum_j n_j} \f$
153
154     Requirements:
155     - \c InputIterator must be a \readable_iterator
156     - \c InputIterator must be a \single_pass_iterator
157     - \c InputIterator::value_type must be convertible to \c double
158
159     \since New in yat 0.12
160   */
161  template<typename InputIterator>
162  double entropy(InputIterator first, InputIterator last);
163
164  /**
165     \brief one-sided p-value
166
167     This function uses the t-distribution to calculate the one-sided
168     p-value. Given that the true correlation is zero (Null
169     hypothesis) the estimated correlation, r, after a transformation
170     is t-distributed:
171
172     \f$ \sqrt{(n-2)} \frac{r}{\sqrt{(1-r^2)}} \in t(n-2) \f$
173
174     \return Probability that correlation is larger than \a r by
175     chance when having \a n samples. For \a r larger or equal to 1.0,
176     0.0 is returned. For \a r smaller or equal to -1.0, 1.0 is
177     returned.
178   */
179  double pearson_p_value(double r, unsigned int n);
180
181  ///
182  /// @brief Computes the kurtosis of the data in a vector.
183  ///
184  /// The kurtosis measures how sharply peaked a distribution is,
185  /// relative to its width. The kurtosis is normalized to zero for a
186  /// gaussian distribution.
187  ///
188  double kurtosis(const utility::VectorBase&);
189
190
191  /**
192     \brief Median absolute deviation from median
193
194     Function is non-mutable function
195
196     Type Requirements:
197     - \c RandomAccessIterator must be a \ref
198     concept_data_iterator
199     - \c RandomAccessIterator must be a \random_access_traversal_iterator
200
201     Since 0.6 function also work with a \ref
202     concept_weighted_iterator
203  */
204  template <class RandomAccessIterator>
205  double mad(RandomAccessIterator first, RandomAccessIterator last,
206             bool sorted=false);
207
208
209  /**
210     Median is defined to be value in the middle. If number of values
211     is even median is the average of the two middle values.  the
212     median value is given by p equal to 50. If \a sorted is false
213     (default), the range is copied, the copy is sorted, and then
214     used to calculate the median.
215
216     Function is a non-mutable function, i.e., \a first and \a last
217     can be const_iterators.
218
219     Type Requirements:
220     - \c RandomAccessIterator must be a \ref
221     concept_data_iterator
222     - \c RandomAccessIterator must be a \random_access_traversal_iterator
223
224     \return median of range
225  */
226  template <class RandomAccessIterator>
227  double median(RandomAccessIterator first, RandomAccessIterator last,
228                bool sorted=false);
229
230
231  /**
232     \brief Calculates the mutual information of \a A.
233
234     The elements in A are unnormalized probabilies of the joint
235     distribution.
236
237     The mutual information is calculated as \f$ \sum \sum p(x,y) \log_2
238     \frac {p(x,y)} {p(x)p(y)} \f$ where
239     \f$ p(x,y) = \frac {A_{xy}}{\sum_{x,y} A_{xy}} \f$;
240     \f$ p(x) = \sum_y A_{xy} / \sum_{x,y} A_{xy} \f$;
241     \f$ p(y) = \sum_x A_{xy} / \sum_{x,y} A_{xy} \f$
242
243     Requirements:
244     - \c T must be a model of \ref concept_container_2d
245     - \c T::value_type must be convertible to \c double
246
247     \return mutual information in bits; if you want in natural base
248     multiply with \c M_LN2 (defined in \c gsl/gsl_math.h )
249
250     \since New in yat 0.12
251   */
252  template<class T>
253  double mutual_information(const T& A);
254
255  /**
256     The percentile is determined by the \a p, a number between 0 and
257     100. The percentile is found by interpolation, using the formula
258     \f$ percentile = (1 - \delta) x_i + \delta x_{i+1} \f$ where \a
259     p is floor\f$((n - 1)p/100)\f$ and \f$ \delta \f$ is \f$
260     (n-1)p/100 - i \f$.Thus the minimum value of the vector is given
261     by p equal to zero, the maximum is given by p equal to 100 and
262     the median value is given by p equal to 50. If @a sorted
263     is false (default), the vector is copied, the copy is sorted,
264     and then used to calculate the median.
265
266     Function is a non-mutable function, i.e., \a first and \a last
267     can be const_iterators.
268
269     Requirements: RandomAccessIterator is an iterator over a range of
270     doubles (or any type being convertable to double).
271
272     @return \a p'th percentile of range
273
274     \deprecated percentile2 will replace this function in the future
275
276     \note the definition of percentile used here is not identical to
277     that one used in percentile2 and Percentile. The difference is
278     smaller for large ranges.
279  */
280  template <class RandomAccessIterator>
281  double percentile(RandomAccessIterator first, RandomAccessIterator last,
282                    double p, bool sorted=false) YAT_DEPRECATE;
283
284  /**
285     \see Percentiler
286
287     Type Requirements:
288     - \c RandomAccessIterator must be a \ref
289     concept_data_iterator
290     - \c RandomAccessIterator must be a \random_access_traversal_iterator
291
292     \since new in yat 0.5
293   */
294  template <class RandomAccessIterator>
295  double percentile2(RandomAccessIterator first, RandomAccessIterator last,
296                     double p, bool sorted=false)
297  {
298    Percentiler percentiler(p, sorted);
299    return percentiler(first, last);
300  }
301
302  ///
303  /// @brief Computes the skewness of the data in a vector.
304  ///
305  /// The skewness measures the asymmetry of the tails of a
306  /// distribution.
307  ///
308  double skewness(const utility::VectorBase&);
309
310
311  //////// template implementations ///////////
312
313  template <typename T, typename ForwardIterator>
314  void add(T& o, ForwardIterator first, ForwardIterator last,
315           const classifier::Target& target)
316  {
317    BOOST_CONCEPT_ASSERT((boost_concepts::ReadableIterator<ForwardIterator>));
318    BOOST_CONCEPT_ASSERT((boost_concepts::SinglePassIterator<ForwardIterator>));
319    utility::iterator_traits<ForwardIterator> traits;
320    for (size_t i=0; first!=last; ++i, ++first)
321      o.add(traits.data(first), target.binary(i), traits.weight(first));
322  }
323
324
325  template<typename BidirectionalIterator1, typename BidirectionalIterator2>
326  void benjamini_hochberg(BidirectionalIterator1 first,
327                          BidirectionalIterator1 last,
328                          BidirectionalIterator2 result)
329  {
330    BOOST_CONCEPT_ASSERT((boost_concepts::ReadableIterator<BidirectionalIterator1>));
331    BOOST_CONCEPT_ASSERT((boost_concepts::BidirectionalTraversal<BidirectionalIterator1>));
332
333    BOOST_CONCEPT_ASSERT((boost_concepts::ReadableIterator<BidirectionalIterator2>));
334    BOOST_CONCEPT_ASSERT((boost_concepts::WritableIterator<BidirectionalIterator2>));
335    BOOST_CONCEPT_ASSERT((boost_concepts::BidirectionalTraversal<BidirectionalIterator2>));
336
337    typedef typename boost::iterator_difference<BidirectionalIterator1>::type
338      diff_type;
339
340    diff_type n = std::distance(first, last);
341    if (!n)
342      return;
343    std::advance(result, n-1);
344    first = last;
345    --first;
346    diff_type rank = n;
347
348    double x = 1.0;
349    while (rank) {
350      x = std::min(*first * n/static_cast<double>(rank), x);
351      *result = x;
352      --rank;
353      --first;
354      --result;
355    }
356  }
357
358
359  template<typename RandomAccessIterator, typename MutableRandomAccessIterator>
360  void benjamini_hochberg_unsorted(RandomAccessIterator first,
361                                   RandomAccessIterator last,
362                                   MutableRandomAccessIterator result)
363  {
364    BOOST_CONCEPT_ASSERT((boost_concepts::ReadableIterator<RandomAccessIterator>));
365    BOOST_CONCEPT_ASSERT((boost_concepts::RandomAccessTraversal<RandomAccessIterator>));
366
367    BOOST_CONCEPT_ASSERT((boost_concepts::ReadableIterator<MutableRandomAccessIterator>));
368    BOOST_CONCEPT_ASSERT((boost_concepts::WritableIterator<MutableRandomAccessIterator>));
369    BOOST_CONCEPT_ASSERT((boost_concepts::RandomAccessTraversal<MutableRandomAccessIterator>));
370
371    std::vector<size_t> idx;
372    utility::sort_index(first, last, idx);
373    benjamini_hochberg(boost::make_permutation_iterator(first, idx.begin()),
374                       boost::make_permutation_iterator(first, idx.end()),
375                       boost::make_permutation_iterator(result, idx.begin()));
376  }
377
378
379  template<typename InputIterator>
380  double entropy(InputIterator first, InputIterator last)
381  {
382    BOOST_CONCEPT_ASSERT((boost_concepts::ReadableIterator<InputIterator>));
383    BOOST_CONCEPT_ASSERT((boost_concepts::SinglePassIterator<InputIterator>));
384    using boost::Convertible;
385    typedef typename boost::iterator_value<InputIterator>::type T;
386    BOOST_CONCEPT_ASSERT((Convertible<T,double>));
387    double sum = 0;
388    double N = 0;
389    for (; first != last; ++first) {
390      if (*first) {
391        N += *first;
392        sum += *first * std::log(static_cast<double>(*first));
393      }
394    }
395    return -sum / N + log(N);
396  }
397
398
399  template <class RandomAccessIterator>
400  double mad(RandomAccessIterator first, RandomAccessIterator last,
401             bool sorted)
402  {
403    BOOST_CONCEPT_ASSERT((utility::DataIteratorConcept<RandomAccessIterator>));
404    BOOST_CONCEPT_ASSERT((boost_concepts::RandomAccessTraversal<RandomAccessIterator>));
405    double m = median(first, last, sorted);
406    typedef typename boost::iterator_value<RandomAccessIterator>::type T;
407    std::vector<T> ad(std::distance(first, last));
408    // copy weights if needed
409    normalizer::detail::copy_weight_if_weighted(first, last, ad.begin());
410    // assign data (absolute deviation from median)
411    utility::iterator_traits<RandomAccessIterator> traits;
412    utility::DataIterator<typename std::vector<T>::iterator> first2(ad.begin());
413    while (first!=last) {
414      *first2 = std::abs(traits.data(first)-m);
415      ++first;
416      ++first2;
417    }
418    return median(ad.begin(), ad.end(), false);
419  }
420
421
422  template <class RandomAccessIterator>
423  double median(RandomAccessIterator first, RandomAccessIterator last,
424                bool sorted)
425  {
426    BOOST_CONCEPT_ASSERT((utility::DataIteratorConcept<RandomAccessIterator>));
427    BOOST_CONCEPT_ASSERT((boost_concepts::RandomAccessTraversal<RandomAccessIterator>));
428    return percentile2(first, last, 50.0, sorted);
429  }
430
431
432  template<class T>
433  double mutual_information(const T& n)
434  {
435    BOOST_CONCEPT_ASSERT((utility::Container2D<T>));
436    using boost::Convertible;
437    BOOST_CONCEPT_ASSERT((Convertible<typename T::value_type,double>));
438
439    // p_x = \sum_y p_xy
440
441    // Mutual Information is defined as
442    // \sum_xy p_xy * log (p_xy / (p_x p_y)) =
443    // \sum_xy p_xy * [log p_xy - log p_x - log p_y]
444    // \sum_xy p_xy log p_xy - p_xy log p_x - p_xy log p_y
445    // \sum_xy p_xy log p_xy - \sum_x p_x log p_x - \sum_y p_y log p_y
446    // - entropy_xy + entropy_x + entropy_y
447
448    utility::Vector rowsum(n.columns(), 0);
449    for (size_t c = 0; c<n.columns(); ++c)
450      rowsum(c) = std::accumulate(n.begin_column(c), n.end_column(c), 0);
451
452    utility::Vector colsum(n.rows(), 0);
453    for (size_t r = 0; r<n.rows(); ++r)
454      colsum(r) = std::accumulate(n.begin_row(r), n.end_row(r), 0);
455
456    double mi  = - entropy(n.begin(), n.end());
457    mi += entropy(rowsum.begin(), rowsum.end());
458    mi += entropy(colsum.begin(), colsum.end());
459
460    return mi/M_LN2;
461  }
462
463
464  template <class RandomAccessIterator>
465  double percentile(RandomAccessIterator first, RandomAccessIterator last,
466                    double p, bool sorted)
467  {
468    BOOST_CONCEPT_ASSERT((boost::RandomAccessIterator<RandomAccessIterator>));
469    BOOST_CONCEPT_ASSERT((utility::DataIteratorConcept<RandomAccessIterator>));
470    // range is one value only is a special case
471    if (first+1 == last)
472      return utility::iterator_traits<RandomAccessIterator>().data(first);
473    if (sorted) {
474      // have to take care of this special case
475      if (p>=100)
476        return utility::iterator_traits<RandomAccessIterator>().data(--last);
477      double j = p/100 * (std::distance(first,last)-1);
478      int i = static_cast<int>(j);
479      return (1-j+floor(j))*first[i] + (j-floor(j))*first[i+1];
480    }
481    using std::iterator_traits;
482    typedef typename iterator_traits<RandomAccessIterator>::value_type value_t;
483    std::vector<value_t> v_copy(first, last);
484    size_t i = static_cast<size_t>(p/100 * (v_copy.size()-1));
485    if (i+2 < v_copy.size()) {
486      std::partial_sort(v_copy.begin(), v_copy.begin()+i+2, v_copy.end());
487    }
488    else
489      std::sort(v_copy.begin(), v_copy.end());
490    return percentile(v_copy.begin(), v_copy.end(), p, true);
491  }
492
493
494}}} // of namespace statistics, yat, and theplu
495
496#endif
Note: See TracBrowser for help on using the repository browser.