source: trunk/yat/statistics/utility.h @ 3550

Last change on this file since 3550 was 3550, checked in by Peter, 5 years ago

Update copyright years. Happy New Year

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 16.4 KB
Line 
1#ifndef _theplu_yat_statistics_utility_
2#define _theplu_yat_statistics_utility_
3
4// $Id: utility.h 3550 2017-01-03 05:41:02Z peter $
5
6/*
7  Copyright (C) 2004 Jari Häkkinen, Peter Johansson
8  Copyright (C) 2005 Peter Johansson
9  Copyright (C) 2006 Jari Häkkinen, Peter Johansson, Markus Ringnér
10  Copyright (C) 2007, 2008 Jari Häkkinen, Peter Johansson
11  Copyright (C) 2009, 2010, 2011, 2013, 2014, 2016 Peter Johansson
12
13  This file is part of the yat library, http://dev.thep.lu.se/yat
14
15  The yat library is free software; you can redistribute it and/or
16  modify it under the terms of the GNU General Public License as
17  published by the Free Software Foundation; either version 3 of the
18  License, or (at your option) any later version.
19
20  The yat library is distributed in the hope that it will be useful,
21  but WITHOUT ANY WARRANTY; without even the implied warranty of
22  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23  General Public License for more details.
24
25  You should have received a copy of the GNU General Public License
26  along with yat. If not, see <http://www.gnu.org/licenses/>.
27*/
28
29#include "Percentiler.h"
30
31#include "yat/classifier/DataLookupWeighted1D.h"
32#include "yat/classifier/Target.h"
33#include "yat/normalizer/utility.h"
34#include "yat/utility/concept_check.h"
35#include "yat/utility/DataIterator.h"
36#include "yat/utility/deprecate.h"
37#include "yat/utility/iterator_traits.h"
38#include "yat/utility/sort_index.h"
39#include "yat/utility/Vector.h"
40#include "yat/utility/VectorBase.h"
41#include "yat/utility/yat_assert.h"
42
43#include <boost/concept_check.hpp>
44#include <boost/iterator/iterator_concepts.hpp>
45#include <boost/iterator/iterator_traits.hpp>
46#include <boost/iterator/permutation_iterator.hpp>
47
48#include <gsl/gsl_math.h>
49#include <gsl/gsl_statistics_double.h>
50
51#include <algorithm>
52#include <cmath>
53#include <iterator>
54#include <stdexcept>
55#include <vector>
56
57namespace theplu {
58namespace yat {
59namespace statistics {
60
61  /**
62     Adding a range [\a first, \a last) into an object of type T. The
63     requirements for the type T is to have an add(double, bool, double)
64     function.
65
66     Type Requirements:
67     - \c ForwardIterator is a \readable_iterator
68     - \c ForwardIterator is \single_pass_iterator
69  */
70  template <typename T, typename ForwardIterator>
71  void add(T& o, ForwardIterator first, ForwardIterator last,
72           const classifier::Target& target);
73
74  /**
75     \brief Benjamini Hochberg multiple test correction
76
77     Given a sorted range of p-values such that
78     \f$ p_1 \le p_2 \le ... \le p_N \f$ a Benjamnini-Hochberg corrected
79     p-value, \c q, is calculated recursively as
80     \f$ q_i = \f$ min \f$(p_i \frac{N}{i}, q_{i+1})\f$ with the anchor
81     constraint that \f$ q_m = p_m \f$.
82
83     Type Requirements:
84     - \c BidirectionalIterator1 is a \readable_iterator
85     - \c BidirectionalIterator1 is a \bidirectional_traversal_iterator
86     - \c BidirectionalIterator1::value type is convertible to \c double
87     - \c BidirectionalIterator2 is a \readable_iterator
88     - \c BidirectionalIterator2 is a \writable_iterator
89     - \c BidirectionalIterator2 is a \bidirectional_traversal_iterator
90     - \c double is convertible to \c BidirectionalIterator2::value_type
91
92     \since New in yat 0.8
93   */
94  template<typename BidirectionalIterator1, typename BidirectionalIterator2>
95  void benjamini_hochberg(BidirectionalIterator1 first,
96                          BidirectionalIterator1 last,
97                          BidirectionalIterator2 result);
98
99
100  /**
101     \brief Benjamini Hochberg multiple test correction
102
103     Similar to benjamini_hochberg() but does not assume that input
104     range, [first, last), is sorted. The resulting range is the same
105     as if sorting input range, call benjamini_hochberg, and unsort
106     the result range.
107
108     Type Requirements:
109     - \c RandomAccessIterator is a \readable_iterator
110     - \c RandomAccessIterator is a \random_access_traversal_iterator
111     - \c RandomAccessIterator::value type is convertible to \c double
112     - \c MutableRandomAccessIterator is a \readable_iterator
113     - \c MutableRandomAccessIterator is a \writable_iterator
114     - \c MutableRandomAccessIterator is a \random_access_traversal_iterator
115     - \c double is convertible to \c MutableRandomAccessIterator::value_type
116
117     \since New in yat 0.13
118   */
119  template<typename RandomAccessIterator, typename MutableRandomAccessIterator>
120  void benjamini_hochberg_unsorted(RandomAccessIterator first,
121                                   RandomAccessIterator last,
122                                   MutableRandomAccessIterator result);
123
124
125  ///
126  /// Calculates the probability to get \a k or smaller from a
127  /// hypergeometric distribution with parameters \a n1 \a n2 \a
128  /// t. Hypergeomtric situation you get in the following situation:
129  /// Let there be \a n1 ways for a "good" selection and \a n2 ways
130  /// for a "bad" selection out of a total of possibilities. Take \a
131  /// t samples without replacement and \a k of those are "good"
132  /// samples. \a k will follow a hypergeomtric distribution.
133  ///
134  /// @return cumulative hypergeomtric distribution functions P(k).
135  ///
136  double cdf_hypergeometric_P(unsigned int k, unsigned int n1,
137                              unsigned int n2, unsigned int t);
138
139
140  /**
141     The entropy is calculated as \f$ - \sum_i p_i \log p_i \f$ where
142     \f$p_i = \frac{n_i}{\sum_j n_j} \f$
143
144     Requirements:
145     - \c InputIterator must be a \readable_iterator
146     - \c InputIterator must be a \single_pass_iterator
147     - \c InputIterator::value_type must be convertible to \c double
148
149     \since New in yat 0.12
150   */
151  template<typename InputIterator>
152  double entropy(InputIterator first, InputIterator last);
153
154  /**
155     \brief one-sided p-value
156
157     This function uses the t-distribution to calculate the one-sided
158     p-value. Given that the true correlation is zero (Null
159     hypothesis) the estimated correlation, r, after a transformation
160     is t-distributed:
161
162     \f$ \sqrt{(n-2)} \frac{r}{\sqrt{(1-r^2)}} \in t(n-2) \f$
163
164     \return Probability that correlation is larger than \a r by
165     chance when having \a n samples. For \a r larger or equal to 1.0,
166     0.0 is returned. For \a r smaller or equal to -1.0, 1.0 is
167     returned.
168   */
169  double pearson_p_value(double r, unsigned int n);
170
171  ///
172  /// @brief Computes the kurtosis of the data in a vector.
173  ///
174  /// The kurtosis measures how sharply peaked a distribution is,
175  /// relative to its width. The kurtosis is normalized to zero for a
176  /// gaussian distribution.
177  ///
178  double kurtosis(const utility::VectorBase&);
179
180
181  /**
182     \brief Median absolute deviation from median
183
184     Function is non-mutable function
185
186     Type Requirements:
187     - \c RandomAccessIterator must be a \ref
188     concept_data_iterator
189     - \c RandomAccessIterator must be a \random_access_traversal_iterator
190
191     Since 0.6 function also work with a \ref
192     concept_weighted_iterator
193  */
194  template <class RandomAccessIterator>
195  double mad(RandomAccessIterator first, RandomAccessIterator last,
196             bool sorted=false);
197
198
199  /**
200     Median is defined to be value in the middle. If number of values
201     is even median is the average of the two middle values.  the
202     median value is given by p equal to 50. If \a sorted is false
203     (default), the range is copied, the copy is sorted, and then
204     used to calculate the median.
205
206     Function is a non-mutable function, i.e., \a first and \a last
207     can be const_iterators.
208
209     Type Requirements:
210     - \c RandomAccessIterator must be a \ref
211     concept_data_iterator
212     - \c RandomAccessIterator must be a \random_access_traversal_iterator
213
214     \return median of range
215  */
216  template <class RandomAccessIterator>
217  double median(RandomAccessIterator first, RandomAccessIterator last,
218                bool sorted=false);
219
220
221  /**
222     \brief Calculates the mutual information of \a A.
223
224     The elements in A are unnormalized probabilies of the joint
225     distribution.
226
227     The mutual information is calculated as \f$ \sum \sum p(x,y) \log_2
228     \frac {p(x,y)} {p(x)p(y)} \f$ where
229     \f$ p(x,y) = \frac {A_{xy}}{\sum_{x,y} A_{xy}} \f$;
230     \f$ p(x) = \sum_y A_{xy} / \sum_{x,y} A_{xy} \f$;
231     \f$ p(y) = \sum_x A_{xy} / \sum_{x,y} A_{xy} \f$
232
233     Requirements:
234     - \c T must be a model of \ref concept_container_2d
235     - \c T::value_type must be convertible to \c double
236
237     \return mutual information in bits; if you want in natural base
238     multiply with \c M_LN2 (defined in \c gsl/gsl_math.h )
239
240     \since New in yat 0.12
241   */
242  template<class T>
243  double mutual_information(const T& A);
244
245  /**
246     The percentile is determined by the \a p, a number between 0 and
247     100. The percentile is found by interpolation, using the formula
248     \f$ percentile = (1 - \delta) x_i + \delta x_{i+1} \f$ where \a
249     p is floor\f$((n - 1)p/100)\f$ and \f$ \delta \f$ is \f$
250     (n-1)p/100 - i \f$.Thus the minimum value of the vector is given
251     by p equal to zero, the maximum is given by p equal to 100 and
252     the median value is given by p equal to 50. If @a sorted
253     is false (default), the vector is copied, the copy is sorted,
254     and then used to calculate the median.
255
256     Function is a non-mutable function, i.e., \a first and \a last
257     can be const_iterators.
258
259     Requirements: RandomAccessIterator is an iterator over a range of
260     doubles (or any type being convertable to double).
261
262     @return \a p'th percentile of range
263
264     \deprecated percentile2 will replace this function in the future
265
266     \note the definition of percentile used here is not identical to
267     that one used in percentile2 and Percentile. The difference is
268     smaller for large ranges.
269  */
270  template <class RandomAccessIterator>
271  double percentile(RandomAccessIterator first, RandomAccessIterator last,
272                    double p, bool sorted=false) YAT_DEPRECATE;
273
274  /**
275     \see Percentiler
276
277     Type Requirements:
278     - \c RandomAccessIterator must be a \ref
279     concept_data_iterator
280     - \c RandomAccessIterator must be a \random_access_traversal_iterator
281
282     \since new in yat 0.5
283   */
284  template <class RandomAccessIterator>
285  double percentile2(RandomAccessIterator first, RandomAccessIterator last,
286                     double p, bool sorted=false)
287  {
288    Percentiler percentiler(p, sorted);
289    return percentiler(first, last);
290  }
291
292  ///
293  /// @brief Computes the skewness of the data in a vector.
294  ///
295  /// The skewness measures the asymmetry of the tails of a
296  /// distribution.
297  ///
298  double skewness(const utility::VectorBase&);
299
300
301  //////// template implementations ///////////
302
303  template <typename T, typename ForwardIterator>
304  void add(T& o, ForwardIterator first, ForwardIterator last,
305           const classifier::Target& target)
306  {
307    BOOST_CONCEPT_ASSERT((boost_concepts::ReadableIterator<ForwardIterator>));
308    BOOST_CONCEPT_ASSERT((boost_concepts::SinglePassIterator<ForwardIterator>));
309    utility::iterator_traits<ForwardIterator> traits;
310    for (size_t i=0; first!=last; ++i, ++first)
311      o.add(traits.data(first), target.binary(i), traits.weight(first));
312  }
313
314
315  template<typename BidirectionalIterator1, typename BidirectionalIterator2>
316  void benjamini_hochberg(BidirectionalIterator1 first,
317                          BidirectionalIterator1 last,
318                          BidirectionalIterator2 result)
319  {
320    BOOST_CONCEPT_ASSERT((boost_concepts::ReadableIterator<BidirectionalIterator1>));
321    BOOST_CONCEPT_ASSERT((boost_concepts::BidirectionalTraversal<BidirectionalIterator1>));
322
323    BOOST_CONCEPT_ASSERT((boost_concepts::ReadableIterator<BidirectionalIterator2>));
324    BOOST_CONCEPT_ASSERT((boost_concepts::WritableIterator<BidirectionalIterator2>));
325    BOOST_CONCEPT_ASSERT((boost_concepts::BidirectionalTraversal<BidirectionalIterator2>));
326
327    typedef typename boost::iterator_difference<BidirectionalIterator1>::type
328      diff_type;
329
330    diff_type n = std::distance(first, last);
331    if (!n)
332      return;
333    std::advance(result, n-1);
334    first = last;
335    --first;
336    diff_type rank = n;
337
338    double x = 1.0;
339    while (rank) {
340      x = std::min(*first * n/static_cast<double>(rank), x);
341      *result = x;
342      --rank;
343      --first;
344      --result;
345    }
346  }
347
348
349  template<typename RandomAccessIterator, typename MutableRandomAccessIterator>
350  void benjamini_hochberg_unsorted(RandomAccessIterator first,
351                                   RandomAccessIterator last,
352                                   MutableRandomAccessIterator result)
353  {
354    BOOST_CONCEPT_ASSERT((boost_concepts::ReadableIterator<RandomAccessIterator>));
355    BOOST_CONCEPT_ASSERT((boost_concepts::RandomAccessTraversal<RandomAccessIterator>));
356
357    BOOST_CONCEPT_ASSERT((boost_concepts::ReadableIterator<MutableRandomAccessIterator>));
358    BOOST_CONCEPT_ASSERT((boost_concepts::WritableIterator<MutableRandomAccessIterator>));
359    BOOST_CONCEPT_ASSERT((boost_concepts::RandomAccessTraversal<MutableRandomAccessIterator>));
360
361    std::vector<size_t> idx;
362    utility::sort_index(first, last, idx);
363    benjamini_hochberg(boost::make_permutation_iterator(first, idx.begin()),
364                       boost::make_permutation_iterator(first, idx.end()),
365                       boost::make_permutation_iterator(result, idx.begin()));
366  }
367
368
369  template<typename InputIterator>
370  double entropy(InputIterator first, InputIterator last)
371  {
372    BOOST_CONCEPT_ASSERT((boost_concepts::ReadableIterator<InputIterator>));
373    BOOST_CONCEPT_ASSERT((boost_concepts::SinglePassIterator<InputIterator>));
374    using boost::Convertible;
375    typedef typename boost::iterator_value<InputIterator>::type T;
376    BOOST_CONCEPT_ASSERT((Convertible<T,double>));
377    double sum = 0;
378    double N = 0;
379    for (; first != last; ++first) {
380      if (*first) {
381        N += *first;
382        sum += *first * std::log(static_cast<double>(*first));
383      }
384    }
385    return -sum / N + log(N);
386  }
387
388
389  template <class RandomAccessIterator>
390  double mad(RandomAccessIterator first, RandomAccessIterator last,
391             bool sorted)
392  {
393    BOOST_CONCEPT_ASSERT((utility::DataIteratorConcept<RandomAccessIterator>));
394    BOOST_CONCEPT_ASSERT((boost_concepts::RandomAccessTraversal<RandomAccessIterator>));
395    double m = median(first, last, sorted);
396    typedef typename boost::iterator_value<RandomAccessIterator>::type T;
397    std::vector<T> ad(std::distance(first, last));
398    // copy weights if needed
399    normalizer::detail::copy_weight_if_weighted(first, last, ad.begin());
400    // assign data (absolute deviation from median)
401    utility::iterator_traits<RandomAccessIterator> traits;
402    utility::DataIterator<typename std::vector<T>::iterator> first2(ad.begin());
403    while (first!=last) {
404      *first2 = std::abs(traits.data(first)-m);
405      ++first;
406      ++first2;
407    }
408    return median(ad.begin(), ad.end(), false);
409  }
410
411
412  template <class RandomAccessIterator>
413  double median(RandomAccessIterator first, RandomAccessIterator last,
414                bool sorted)
415  {
416    BOOST_CONCEPT_ASSERT((utility::DataIteratorConcept<RandomAccessIterator>));
417    BOOST_CONCEPT_ASSERT((boost_concepts::RandomAccessTraversal<RandomAccessIterator>));
418    return percentile2(first, last, 50.0, sorted);
419  }
420
421
422  template<class T>
423  double mutual_information(const T& n)
424  {
425    BOOST_CONCEPT_ASSERT((utility::Container2D<T>));
426    using boost::Convertible;
427    BOOST_CONCEPT_ASSERT((Convertible<typename T::value_type,double>));
428
429    // p_x = \sum_y p_xy
430
431    // Mutual Information is defined as
432    // \sum_xy p_xy * log (p_xy / (p_x p_y)) =
433    // \sum_xy p_xy * [log p_xy - log p_x - log p_y]
434    // \sum_xy p_xy log p_xy - p_xy log p_x - p_xy log p_y
435    // \sum_xy p_xy log p_xy - \sum_x p_x log p_x - \sum_y p_y log p_y
436    // - entropy_xy + entropy_x + entropy_y
437
438    utility::Vector rowsum(n.columns(), 0);
439    for (size_t c = 0; c<n.columns(); ++c)
440      rowsum(c) = std::accumulate(n.begin_column(c), n.end_column(c), 0);
441
442    utility::Vector colsum(n.rows(), 0);
443    for (size_t r = 0; r<n.rows(); ++r)
444      colsum(r) = std::accumulate(n.begin_row(r), n.end_row(r), 0);
445
446    double mi  = - entropy(n.begin(), n.end());
447    mi += entropy(rowsum.begin(), rowsum.end());
448    mi += entropy(colsum.begin(), colsum.end());
449
450    return mi/M_LN2;
451  }
452
453
454  template <class RandomAccessIterator>
455  double percentile(RandomAccessIterator first, RandomAccessIterator last,
456                    double p, bool sorted)
457  {
458    BOOST_CONCEPT_ASSERT((boost::RandomAccessIterator<RandomAccessIterator>));
459    BOOST_CONCEPT_ASSERT((utility::DataIteratorConcept<RandomAccessIterator>));
460    // range is one value only is a special case
461    if (first+1 == last)
462      return utility::iterator_traits<RandomAccessIterator>().data(first);
463    if (sorted) {
464      // have to take care of this special case
465      if (p>=100)
466        return utility::iterator_traits<RandomAccessIterator>().data(--last);
467      double j = p/100 * (std::distance(first,last)-1);
468      int i = static_cast<int>(j);
469      return (1-j+floor(j))*first[i] + (j-floor(j))*first[i+1];
470    }
471    using std::iterator_traits;
472    typedef typename iterator_traits<RandomAccessIterator>::value_type value_t;
473    std::vector<value_t> v_copy(first, last);
474    size_t i = static_cast<size_t>(p/100 * (v_copy.size()-1));
475    if (i+2 < v_copy.size()) {
476      std::partial_sort(v_copy.begin(), v_copy.begin()+i+2, v_copy.end());
477    }
478    else
479      std::sort(v_copy.begin(), v_copy.end());
480    return percentile(v_copy.begin(), v_copy.end(), p, true);
481  }
482
483
484}}} // of namespace statistics, yat, and theplu
485
486#endif
Note: See TracBrowser for help on using the repository browser.