source: trunk/yat/statistics/utility.h @ 3135

Last change on this file since 3135 was 3135, checked in by Peter, 9 years ago

add function that calculates entropy from a range

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 11.2 KB
Line 
1#ifndef _theplu_yat_statistics_utility_
2#define _theplu_yat_statistics_utility_
3
4// $Id: utility.h 3135 2013-11-27 07:58:49Z peter $
5
6/*
7  Copyright (C) 2004 Jari Häkkinen, Peter Johansson
8  Copyright (C) 2005 Peter Johansson
9  Copyright (C) 2006 Jari Häkkinen, Peter Johansson, Markus Ringnér
10  Copyright (C) 2007, 2008 Jari Häkkinen, Peter Johansson
11  Copyright (C) 2009, 2010, 2011, 2013 Peter Johansson
12
13  This file is part of the yat library, http://dev.thep.lu.se/yat
14
15  The yat library is free software; you can redistribute it and/or
16  modify it under the terms of the GNU General Public License as
17  published by the Free Software Foundation; either version 3 of the
18  License, or (at your option) any later version.
19
20  The yat library is distributed in the hope that it will be useful,
21  but WITHOUT ANY WARRANTY; without even the implied warranty of
22  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23  General Public License for more details.
24
25  You should have received a copy of the GNU General Public License
26  along with yat. If not, see <http://www.gnu.org/licenses/>.
27*/
28
29#include "Percentiler.h"
30
31#include "yat/classifier/DataLookupWeighted1D.h"
32#include "yat/classifier/Target.h"
33#include "yat/normalizer/utility.h"
34#include "yat/utility/concept_check.h"
35#include "yat/utility/DataIterator.h"
36#include "yat/utility/deprecate.h"
37#include "yat/utility/iterator_traits.h"
38#include "yat/utility/VectorBase.h"
39#include "yat/utility/yat_assert.h"
40
41#include <boost/concept_check.hpp>
42
43#include <algorithm>
44#include <cmath>
45#include <iterator>
46#include <stdexcept>
47#include <vector>
48
49#include <gsl/gsl_statistics_double.h>
50
51namespace theplu {
52namespace yat {
53namespace statistics {
54
55  /**
56     Adding a range [\a first, \a last) into an object of type T. The
57     requirements for the type T is to have an add(double, bool, double)
58     function.
59  */
60  template <typename T, typename ForwardIterator>
61  void add(T& o, ForwardIterator first, ForwardIterator last,
62           const classifier::Target& target);
63
64  /**
65     \brief Benjamini Hochberg multiple test correction
66
67     Given a sorted range of p-values such that
68     \f$ p_1 \le p_2 \le ... \le p_N \f$ a Benjamnini-Hochberg corrected
69     p-value, \c q, is calculated recursively as
70     \f$ q_i = \f$ min \f$(p_i \frac{N}{i}, q_{i+1})\f$ with the anchor
71     constraint that \f$ q_m = p_m \f$.
72
73     Requirements: \c BidirectionalIterator1 should be a
74     \bidirectional_iterator and \c BidirectionalIterator2 should be a
75     mutable \bidirectional_iterator
76
77     \since New in yat 0.8
78   */
79  template<typename BidirectionalIterator1, typename BidirectionalIterator2>
80  void benjamini_hochberg(BidirectionalIterator1 first,
81                          BidirectionalIterator1 last,
82                          BidirectionalIterator2 result);
83                         
84
85  ///
86  /// Calculates the probability to get \a k or smaller from a
87  /// hypergeometric distribution with parameters \a n1 \a n2 \a
88  /// t. Hypergeomtric situation you get in the following situation:
89  /// Let there be \a n1 ways for a "good" selection and \a n2 ways
90  /// for a "bad" selection out of a total of possibilities. Take \a
91  /// t samples without replacement and \a k of those are "good"
92  /// samples. \a k will follow a hypergeomtric distribution.
93  ///
94  /// @return cumulative hypergeomtric distribution functions P(k).
95  ///
96  double cdf_hypergeometric_P(unsigned int k, unsigned int n1, 
97                              unsigned int n2, unsigned int t);
98
99
100  /**
101     The entropy is calculated as \f$ \sum_i p_i \log p_i \f$ where
102     \f$p_i = \frac{n_i}{\sum_j n_j} \f$
103
104     Requirements: \c InputIterator should be an \input_iterator
105     Requirements: \c InputIterator::value_type must be convertible to \c double
106   */
107  template<typename InputIterator>
108  double entropy(InputIterator first, InputIterator last);
109
110  /**
111     \brief one-sided p-value
112
113     This function uses the t-distribution to calculate the one-sided
114     p-value. Given that the true correlation is zero (Null
115     hypothesis) the estimated correlation, r, after a transformation
116     is t-distributed:
117
118     \f$ \sqrt{(n-2)} \frac{r}{\sqrt{(1-r^2)}} \in t(n-2) \f$
119
120     \return Probability that correlation is larger than \a r by
121     chance when having \a n samples. For \a r larger or equal to 1.0,
122     0.0 is returned. For \a r smaller or equal to -1.0, 1.0 is
123     returned.
124   */
125  double pearson_p_value(double r, unsigned int n);
126
127  ///
128  /// @brief Computes the kurtosis of the data in a vector.
129  ///
130  /// The kurtosis measures how sharply peaked a distribution is,
131  /// relative to its width. The kurtosis is normalized to zero for a
132  /// gaussian distribution.
133  ///
134  double kurtosis(const utility::VectorBase&);
135
136
137  ///
138  /// @brief Median absolute deviation from median
139  ///
140  /// Function is non-mutable function
141  ///
142  /// Requirements: \c RandomAccessIterator should be a \ref
143  /// concept_data_iterator and \random_access_iterator
144  ///
145  /// Since 0.6 function also work with a \ref
146  /// concept_weighted_iterator
147  ///
148  template <class RandomAccessIterator>
149  double mad(RandomAccessIterator first, RandomAccessIterator last, 
150             bool sorted=false);
151
152
153  ///
154  /// Median is defined to be value in the middle. If number of values
155  /// is even median is the average of the two middle values.  the
156  /// median value is given by p equal to 50. If \a sorted is false
157  /// (default), the range is copied, the copy is sorted, and then
158  /// used to calculate the median.
159  ///
160  /// Function is a non-mutable function, i.e., \a first and \a last
161  /// can be const_iterators.
162  ///
163  /// Requirements: \c RandomAccessIterator should be a \ref
164  /// concept_data_iterator and \random_access_iterator
165  ///
166  /// @return median of range
167  ///
168  template <class RandomAccessIterator> 
169  double median(RandomAccessIterator first, RandomAccessIterator last, 
170                bool sorted=false); 
171
172
173  /**
174     The percentile is determined by the \a p, a number between 0 and
175     100. The percentile is found by interpolation, using the formula
176     \f$ percentile = (1 - \delta) x_i + \delta x_{i+1} \f$ where \a
177     p is floor\f$((n - 1)p/100)\f$ and \f$ \delta \f$ is \f$
178     (n-1)p/100 - i \f$.Thus the minimum value of the vector is given
179     by p equal to zero, the maximum is given by p equal to 100 and
180     the median value is given by p equal to 50. If @a sorted
181     is false (default), the vector is copied, the copy is sorted,
182     and then used to calculate the median.
183
184     Function is a non-mutable function, i.e., \a first and \a last
185     can be const_iterators.
186     
187     Requirements: RandomAccessIterator is an iterator over a range of
188     doubles (or any type being convertable to double).
189     
190     @return \a p'th percentile of range
191
192     \deprecated percentile2 will replace this function in the future
193
194     \note the definition of percentile used here is not identical to
195     that one used in percentile2 and Percentile. The difference is
196     smaller for large ranges.
197  */
198  template <class RandomAccessIterator>
199  double percentile(RandomAccessIterator first, RandomAccessIterator last, 
200                    double p, bool sorted=false) YAT_DEPRECATE;
201
202  /**
203     \see Percentiler
204     
205     \since new in yat 0.5
206   */
207  template <class RandomAccessIterator>
208  double percentile2(RandomAccessIterator first, RandomAccessIterator last, 
209                     double p, bool sorted=false)
210  {
211    Percentiler percentiler(p, sorted);
212    return percentiler(first, last);
213  }
214
215  ///
216  /// @brief Computes the skewness of the data in a vector.
217  ///
218  /// The skewness measures the asymmetry of the tails of a
219  /// distribution.
220  ///
221  double skewness(const utility::VectorBase&);
222 
223
224  //////// template implementations ///////////
225
226  template <typename T, typename ForwardIterator>
227  void add(T& o, ForwardIterator first, ForwardIterator last,
228           const classifier::Target& target)
229  {
230    utility::iterator_traits<ForwardIterator> traits;
231    for (size_t i=0; first!=last; ++i, ++first)
232      o.add(traits.data(first), target.binary(i), traits.weight(first));
233  } 
234
235
236  template<typename BidirectionalIterator1, typename BidirectionalIterator2>
237  void benjamini_hochberg(BidirectionalIterator1 first,
238                          BidirectionalIterator1 last,
239                          BidirectionalIterator2 result)
240  {
241    using boost::Mutable_BidirectionalIterator;
242    BOOST_CONCEPT_ASSERT((boost::BidirectionalIterator<BidirectionalIterator1>));
243    BOOST_CONCEPT_ASSERT((Mutable_BidirectionalIterator<BidirectionalIterator2>));
244    size_t n = std::distance(first, last);
245    if (!n)
246      return;
247    std::advance(result, n-1);
248    first = last;
249    --first;
250    size_t rank = n;
251   
252    double prev = 1.0;
253    while (rank) {
254      *result = std::min(*first * n/static_cast<double>(rank), prev);
255      prev = *result;
256      --rank;
257      --first;
258      --result;
259    }
260  }
261
262
263  template<typename InputIterator>
264  double entropy(InputIterator first, InputIterator last)
265  {
266    BOOST_CONCEPT_ASSERT((boost::InputIterator<InputIterator>));
267    using boost::Convertible;
268    typedef typename InputIterator::value_type T;
269    BOOST_CONCEPT_ASSERT((Convertible<T,double>));
270    double sum = 0;
271    double N = 0;
272    for (; first != last; ++first) {
273      if (*first) {
274        N += *first;
275        sum += *first * std::log(static_cast<double>(*first));
276      }
277    }
278    return -sum / N + log(N);
279  }
280
281
282  template <class RandomAccessIterator>
283  double mad(RandomAccessIterator first, RandomAccessIterator last, 
284             bool sorted)
285  {
286    BOOST_CONCEPT_ASSERT((boost::RandomAccessIterator<RandomAccessIterator>));
287    BOOST_CONCEPT_ASSERT((utility::DataIteratorConcept<RandomAccessIterator>));
288    double m = median(first, last, sorted);
289    typedef typename std::iterator_traits<RandomAccessIterator>::value_type T;
290    std::vector<T> ad(std::distance(first, last));
291    // copy weights if needed
292    normalizer::detail::copy_weight_if_weighted(first, last, ad.begin());
293    // assign data (absolute deviation from median)
294    utility::iterator_traits<RandomAccessIterator> traits;
295    utility::DataIterator<typename std::vector<T>::iterator> first2(ad.begin());
296    while (first!=last) {
297      *first2 = std::abs(traits.data(first)-m);
298      ++first;
299      ++first2;
300    }
301    std::sort(ad.begin(), ad.end());
302    return median(ad.begin(), ad.end(), true);
303  }
304 
305
306  template <class RandomAccessIterator> 
307  double median(RandomAccessIterator first, RandomAccessIterator last, 
308                bool sorted) 
309  { 
310    return percentile2(first, last, 50.0, sorted); 
311  }
312
313
314  template <class RandomAccessIterator>
315  double percentile(RandomAccessIterator first, RandomAccessIterator last, 
316                    double p, bool sorted)
317  {
318    BOOST_CONCEPT_ASSERT((boost::RandomAccessIterator<RandomAccessIterator>));
319    BOOST_CONCEPT_ASSERT((utility::DataIteratorConcept<RandomAccessIterator>));
320    // range is one value only is a special case
321    if (first+1 == last)
322      return utility::iterator_traits<RandomAccessIterator>().data(first);
323    if (sorted) {
324      // have to take care of this special case
325      if (p>=100)
326        return utility::iterator_traits<RandomAccessIterator>().data(--last);
327      double j = p/100 * (std::distance(first,last)-1);
328      int i = static_cast<int>(j);
329      return (1-j+floor(j))*first[i] + (j-floor(j))*first[i+1];
330    }
331    using std::iterator_traits;
332    typedef typename iterator_traits<RandomAccessIterator>::value_type value_t;
333    std::vector<value_t> v_copy;
334    v_copy.reserve(std::distance(first,last));
335    std::copy(first, last, std::back_inserter(v_copy));
336    size_t i = static_cast<size_t>(p/100 * (v_copy.size()-1));
337    if (i+2 < v_copy.size()) {
338      std::partial_sort(v_copy.begin(), v_copy.begin()+i+2, v_copy.end());
339    }
340    else
341      std::sort(v_copy.begin(), v_copy.end());
342    return percentile(v_copy.begin(), v_copy.end(), p, true);
343  }
344
345
346}}} // of namespace statistics, yat, and theplu
347
348#endif
Note: See TracBrowser for help on using the repository browser.