source: trunk/yat/statistics/utility.h @ 3245

Last change on this file since 3245 was 3245, checked in by Peter, 9 years ago

avoid sorting entire range; instead let Percentiler be clever

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 14.8 KB
Line 
1#ifndef _theplu_yat_statistics_utility_
2#define _theplu_yat_statistics_utility_
3
4// $Id: utility.h 3245 2014-05-24 17:01:31Z peter $
5
6/*
7  Copyright (C) 2004 Jari Häkkinen, Peter Johansson
8  Copyright (C) 2005 Peter Johansson
9  Copyright (C) 2006 Jari Häkkinen, Peter Johansson, Markus Ringnér
10  Copyright (C) 2007, 2008 Jari Häkkinen, Peter Johansson
11  Copyright (C) 2009, 2010, 2011, 2013, 2014 Peter Johansson
12
13  This file is part of the yat library, http://dev.thep.lu.se/yat
14
15  The yat library is free software; you can redistribute it and/or
16  modify it under the terms of the GNU General Public License as
17  published by the Free Software Foundation; either version 3 of the
18  License, or (at your option) any later version.
19
20  The yat library is distributed in the hope that it will be useful,
21  but WITHOUT ANY WARRANTY; without even the implied warranty of
22  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23  General Public License for more details.
24
25  You should have received a copy of the GNU General Public License
26  along with yat. If not, see <http://www.gnu.org/licenses/>.
27*/
28
29#include "Percentiler.h"
30
31#include "yat/classifier/DataLookupWeighted1D.h"
32#include "yat/classifier/Target.h"
33#include "yat/normalizer/utility.h"
34#include "yat/utility/concept_check.h"
35#include "yat/utility/DataIterator.h"
36#include "yat/utility/deprecate.h"
37#include "yat/utility/iterator_traits.h"
38#include "yat/utility/sort_index.h"
39#include "yat/utility/Vector.h"
40#include "yat/utility/VectorBase.h"
41#include "yat/utility/yat_assert.h"
42
43#include <boost/concept_check.hpp>
44#include <boost/iterator/permutation_iterator.hpp>
45
46#include <gsl/gsl_math.h>
47#include <gsl/gsl_statistics_double.h>
48
49#include <algorithm>
50#include <cmath>
51#include <iterator>
52#include <stdexcept>
53#include <vector>
54
55namespace theplu {
56namespace yat {
57namespace statistics {
58
59  /**
60     Adding a range [\a first, \a last) into an object of type T. The
61     requirements for the type T is to have an add(double, bool, double)
62     function.
63  */
64  template <typename T, typename ForwardIterator>
65  void add(T& o, ForwardIterator first, ForwardIterator last,
66           const classifier::Target& target);
67
68  /**
69     \brief Benjamini Hochberg multiple test correction
70
71     Given a sorted range of p-values such that
72     \f$ p_1 \le p_2 \le ... \le p_N \f$ a Benjamnini-Hochberg corrected
73     p-value, \c q, is calculated recursively as
74     \f$ q_i = \f$ min \f$(p_i \frac{N}{i}, q_{i+1})\f$ with the anchor
75     constraint that \f$ q_m = p_m \f$.
76
77     Type Requirements:
78     - \c BidirectionalIterator1 is a \bidirectional_iterator
79     - \c BidirectionalIterator1::value type is convertible to \c double
80     - \c BidirectionalIterator2 is a mutable \bidirectional_iterator
81     - \c double is convertible to \c BidirectionalIterator2::value_type
82
83     \since New in yat 0.8
84   */
85  template<typename BidirectionalIterator1, typename BidirectionalIterator2>
86  void benjamini_hochberg(BidirectionalIterator1 first,
87                          BidirectionalIterator1 last,
88                          BidirectionalIterator2 result);
89
90
91  /**
92     \brief Benjamini Hochberg multiple test correction
93
94     Similar to benjamini_hochberg() but does not assume that input
95     range, [first, last), is sorted. The resulting range is the same
96     as if sorting input range, call benjamini_hochberg, and unsort
97     the result range.
98
99     Type Requirements:
100     - \c RandomAccessIterator models a \random_access_iterator
101     - \c RandomAccessIterator::value type is convertible to \c double
102     - \c MutableRandomAccessIterator models a mutable \random_access_iterator
103     - \c double is convertible to \c MutableRandomAccessIterator::value_type
104
105     \since New in yat 0.13
106   */
107  template<typename RandomAccessIterator, typename MutableRandomAccessIterator>
108  void benjamini_hochberg_unsorted(RandomAccessIterator first,
109                                   RandomAccessIterator last,
110                                   MutableRandomAccessIterator result);
111
112
113  ///
114  /// Calculates the probability to get \a k or smaller from a
115  /// hypergeometric distribution with parameters \a n1 \a n2 \a
116  /// t. Hypergeomtric situation you get in the following situation:
117  /// Let there be \a n1 ways for a "good" selection and \a n2 ways
118  /// for a "bad" selection out of a total of possibilities. Take \a
119  /// t samples without replacement and \a k of those are "good"
120  /// samples. \a k will follow a hypergeomtric distribution.
121  ///
122  /// @return cumulative hypergeomtric distribution functions P(k).
123  ///
124  double cdf_hypergeometric_P(unsigned int k, unsigned int n1,
125                              unsigned int n2, unsigned int t);
126
127
128  /**
129     The entropy is calculated as \f$ - \sum_i p_i \log p_i \f$ where
130     \f$p_i = \frac{n_i}{\sum_j n_j} \f$
131
132     Requirements:
133     - \c InputIterator should be an \input_iterator
134     - \c InputIterator::value_type must be convertible to \c double
135
136     \since New in yat 0.12
137   */
138  template<typename InputIterator>
139  double entropy(InputIterator first, InputIterator last);
140
141  /**
142     \brief one-sided p-value
143
144     This function uses the t-distribution to calculate the one-sided
145     p-value. Given that the true correlation is zero (Null
146     hypothesis) the estimated correlation, r, after a transformation
147     is t-distributed:
148
149     \f$ \sqrt{(n-2)} \frac{r}{\sqrt{(1-r^2)}} \in t(n-2) \f$
150
151     \return Probability that correlation is larger than \a r by
152     chance when having \a n samples. For \a r larger or equal to 1.0,
153     0.0 is returned. For \a r smaller or equal to -1.0, 1.0 is
154     returned.
155   */
156  double pearson_p_value(double r, unsigned int n);
157
158  ///
159  /// @brief Computes the kurtosis of the data in a vector.
160  ///
161  /// The kurtosis measures how sharply peaked a distribution is,
162  /// relative to its width. The kurtosis is normalized to zero for a
163  /// gaussian distribution.
164  ///
165  double kurtosis(const utility::VectorBase&);
166
167
168  ///
169  /// @brief Median absolute deviation from median
170  ///
171  /// Function is non-mutable function
172  ///
173  /// Requirements: \c RandomAccessIterator should be a \ref
174  /// concept_data_iterator and \random_access_iterator
175  ///
176  /// Since 0.6 function also work with a \ref
177  /// concept_weighted_iterator
178  ///
179  template <class RandomAccessIterator>
180  double mad(RandomAccessIterator first, RandomAccessIterator last,
181             bool sorted=false);
182
183
184  ///
185  /// Median is defined to be value in the middle. If number of values
186  /// is even median is the average of the two middle values.  the
187  /// median value is given by p equal to 50. If \a sorted is false
188  /// (default), the range is copied, the copy is sorted, and then
189  /// used to calculate the median.
190  ///
191  /// Function is a non-mutable function, i.e., \a first and \a last
192  /// can be const_iterators.
193  ///
194  /// Requirements: \c RandomAccessIterator should be a \ref
195  /// concept_data_iterator and \random_access_iterator
196  ///
197  /// @return median of range
198  ///
199  template <class RandomAccessIterator>
200  double median(RandomAccessIterator first, RandomAccessIterator last,
201                bool sorted=false);
202
203
204  /**
205     \brief Calculates the mutual information of \a A.
206
207     The elements in A are unnormalized probabilies of the joint
208     distribution.
209
210     The mutual information is calculated as \f$ \sum \sum p(x,y) \log_2
211     \frac {p(x,y)} {p(x)p(y)} \f$ where
212     \f$ p(x,y) = \frac {A_{xy}}{\sum_{x,y} A_{xy}} \f$;
213     \f$ p(x) = \sum_y A_{xy} / \sum_{x,y} A_{xy} \f$;
214     \f$ p(y) = \sum_x A_{xy} / \sum_{x,y} A_{xy} \f$
215
216     Requirements:
217     - \c T must be a model of \ref concept_container_2d
218     - \c T::value_type must be convertible to \c double
219
220     \return mutual information in bits; if you want in natural base
221     multiply with \c M_LN2 (defined in \c gsl/gsl_math.h )
222
223     \since New in yat 0.12
224   */
225  template<class T>
226  double mutual_information(const T& A);
227
228  /**
229     The percentile is determined by the \a p, a number between 0 and
230     100. The percentile is found by interpolation, using the formula
231     \f$ percentile = (1 - \delta) x_i + \delta x_{i+1} \f$ where \a
232     p is floor\f$((n - 1)p/100)\f$ and \f$ \delta \f$ is \f$
233     (n-1)p/100 - i \f$.Thus the minimum value of the vector is given
234     by p equal to zero, the maximum is given by p equal to 100 and
235     the median value is given by p equal to 50. If @a sorted
236     is false (default), the vector is copied, the copy is sorted,
237     and then used to calculate the median.
238
239     Function is a non-mutable function, i.e., \a first and \a last
240     can be const_iterators.
241
242     Requirements: RandomAccessIterator is an iterator over a range of
243     doubles (or any type being convertable to double).
244
245     @return \a p'th percentile of range
246
247     \deprecated percentile2 will replace this function in the future
248
249     \note the definition of percentile used here is not identical to
250     that one used in percentile2 and Percentile. The difference is
251     smaller for large ranges.
252  */
253  template <class RandomAccessIterator>
254  double percentile(RandomAccessIterator first, RandomAccessIterator last,
255                    double p, bool sorted=false) YAT_DEPRECATE;
256
257  /**
258     \see Percentiler
259
260     \since new in yat 0.5
261   */
262  template <class RandomAccessIterator>
263  double percentile2(RandomAccessIterator first, RandomAccessIterator last,
264                     double p, bool sorted=false)
265  {
266    Percentiler percentiler(p, sorted);
267    return percentiler(first, last);
268  }
269
270  ///
271  /// @brief Computes the skewness of the data in a vector.
272  ///
273  /// The skewness measures the asymmetry of the tails of a
274  /// distribution.
275  ///
276  double skewness(const utility::VectorBase&);
277
278
279  //////// template implementations ///////////
280
281  template <typename T, typename ForwardIterator>
282  void add(T& o, ForwardIterator first, ForwardIterator last,
283           const classifier::Target& target)
284  {
285    utility::iterator_traits<ForwardIterator> traits;
286    for (size_t i=0; first!=last; ++i, ++first)
287      o.add(traits.data(first), target.binary(i), traits.weight(first));
288  }
289
290
291  template<typename BidirectionalIterator1, typename BidirectionalIterator2>
292  void benjamini_hochberg(BidirectionalIterator1 first,
293                          BidirectionalIterator1 last,
294                          BidirectionalIterator2 result)
295  {
296    using boost::Mutable_BidirectionalIterator;
297    using boost::BidirectionalIterator;
298    BOOST_CONCEPT_ASSERT((BidirectionalIterator<BidirectionalIterator1>));
299    BOOST_CONCEPT_ASSERT((Mutable_BidirectionalIterator<BidirectionalIterator2>));
300    typedef typename std::iterator_traits<BidirectionalIterator1> traits;
301    typename traits::difference_type n = std::distance(first, last);
302    if (!n)
303      return;
304    std::advance(result, n-1);
305    first = last;
306    --first;
307    typename traits::difference_type rank = n;
308
309    double prev = 1.0;
310    while (rank) {
311      *result = std::min(*first * n/static_cast<double>(rank), prev);
312      prev = *result;
313      --rank;
314      --first;
315      --result;
316    }
317  }
318
319
320  template<typename RandomAccessIterator, typename MutableRandomAccessIterator>
321  void benjamini_hochberg_unsorted(RandomAccessIterator first,
322                                   RandomAccessIterator last,
323                                   MutableRandomAccessIterator result)
324  {
325    BOOST_CONCEPT_ASSERT((boost::RandomAccessIterator<RandomAccessIterator>));
326    using boost::Mutable_RandomAccessIterator;
327    BOOST_CONCEPT_ASSERT((Mutable_RandomAccessIterator<MutableRandomAccessIterator>));
328
329    std::vector<size_t> idx;
330    utility::sort_index(first, last, idx);
331    benjamini_hochberg(boost::make_permutation_iterator(first, idx.begin()),
332                       boost::make_permutation_iterator(first, idx.end()),
333                       boost::make_permutation_iterator(result, idx.begin()));
334  }
335
336
337  template<typename InputIterator>
338  double entropy(InputIterator first, InputIterator last)
339  {
340    BOOST_CONCEPT_ASSERT((boost::InputIterator<InputIterator>));
341    using boost::Convertible;
342    typedef typename InputIterator::value_type T;
343    BOOST_CONCEPT_ASSERT((Convertible<T,double>));
344    double sum = 0;
345    double N = 0;
346    for (; first != last; ++first) {
347      if (*first) {
348        N += *first;
349        sum += *first * std::log(static_cast<double>(*first));
350      }
351    }
352    return -sum / N + log(N);
353  }
354
355
356  template <class RandomAccessIterator>
357  double mad(RandomAccessIterator first, RandomAccessIterator last,
358             bool sorted)
359  {
360    BOOST_CONCEPT_ASSERT((boost::RandomAccessIterator<RandomAccessIterator>));
361    BOOST_CONCEPT_ASSERT((utility::DataIteratorConcept<RandomAccessIterator>));
362    double m = median(first, last, sorted);
363    typedef typename std::iterator_traits<RandomAccessIterator>::value_type T;
364    std::vector<T> ad(std::distance(first, last));
365    // copy weights if needed
366    normalizer::detail::copy_weight_if_weighted(first, last, ad.begin());
367    // assign data (absolute deviation from median)
368    utility::iterator_traits<RandomAccessIterator> traits;
369    utility::DataIterator<typename std::vector<T>::iterator> first2(ad.begin());
370    while (first!=last) {
371      *first2 = std::abs(traits.data(first)-m);
372      ++first;
373      ++first2;
374    }
375    return median(ad.begin(), ad.end(), false);
376  }
377
378
379  template <class RandomAccessIterator>
380  double median(RandomAccessIterator first, RandomAccessIterator last,
381                bool sorted)
382  {
383    return percentile2(first, last, 50.0, sorted);
384  }
385
386
387  template<class T>
388  double mutual_information(const T& n)
389  {
390    BOOST_CONCEPT_ASSERT((utility::Container2D<T>));
391    using boost::Convertible;
392    BOOST_CONCEPT_ASSERT((Convertible<typename T::value_type,double>));
393
394    // p_x = \sum_y p_xy
395
396    // Mutual Information is defined as
397    // \sum_xy p_xy * log (p_xy / (p_x p_y)) =
398    // \sum_xy p_xy * [log p_xy - log p_x - log p_y]
399    // \sum_xy p_xy log p_xy - p_xy log p_x - p_xy log p_y
400    // \sum_xy p_xy log p_xy - \sum_x p_x log p_x - \sum_y p_y log p_y
401    // - entropy_xy + entropy_x + entropy_y
402
403    utility::Vector rowsum(n.columns(), 0);
404    for (size_t c = 0; c<n.columns(); ++c)
405      rowsum(c) = std::accumulate(n.begin_column(c), n.end_column(c), 0);
406
407    utility::Vector colsum(n.rows(), 0);
408    for (size_t r = 0; r<n.rows(); ++r)
409      colsum(r) = std::accumulate(n.begin_row(r), n.end_row(r), 0);
410
411    double mi  = - entropy(n.begin(), n.end());
412    mi += entropy(rowsum.begin(), rowsum.end());
413    mi += entropy(colsum.begin(), colsum.end());
414
415    return mi/M_LN2;
416  }
417
418
419  template <class RandomAccessIterator>
420  double percentile(RandomAccessIterator first, RandomAccessIterator last,
421                    double p, bool sorted)
422  {
423    BOOST_CONCEPT_ASSERT((boost::RandomAccessIterator<RandomAccessIterator>));
424    BOOST_CONCEPT_ASSERT((utility::DataIteratorConcept<RandomAccessIterator>));
425    // range is one value only is a special case
426    if (first+1 == last)
427      return utility::iterator_traits<RandomAccessIterator>().data(first);
428    if (sorted) {
429      // have to take care of this special case
430      if (p>=100)
431        return utility::iterator_traits<RandomAccessIterator>().data(--last);
432      double j = p/100 * (std::distance(first,last)-1);
433      int i = static_cast<int>(j);
434      return (1-j+floor(j))*first[i] + (j-floor(j))*first[i+1];
435    }
436    using std::iterator_traits;
437    typedef typename iterator_traits<RandomAccessIterator>::value_type value_t;
438    std::vector<value_t> v_copy(first, last);
439    size_t i = static_cast<size_t>(p/100 * (v_copy.size()-1));
440    if (i+2 < v_copy.size()) {
441      std::partial_sort(v_copy.begin(), v_copy.begin()+i+2, v_copy.end());
442    }
443    else
444      std::sort(v_copy.begin(), v_copy.end());
445    return percentile(v_copy.begin(), v_copy.end(), p, true);
446  }
447
448
449}}} // of namespace statistics, yat, and theplu
450
451#endif
Note: See TracBrowser for help on using the repository browser.