source: trunk/yat/statistics/utility.h @ 3136

Last change on this file since 3136 was 3136, checked in by Peter, 8 years ago

fix docs and remove WS

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 11.2 KB
Line 
1#ifndef _theplu_yat_statistics_utility_
2#define _theplu_yat_statistics_utility_
3
4// $Id: utility.h 3136 2013-11-28 00:18:22Z peter $
5
6/*
7  Copyright (C) 2004 Jari Häkkinen, Peter Johansson
8  Copyright (C) 2005 Peter Johansson
9  Copyright (C) 2006 Jari Häkkinen, Peter Johansson, Markus Ringnér
10  Copyright (C) 2007, 2008 Jari Häkkinen, Peter Johansson
11  Copyright (C) 2009, 2010, 2011, 2013 Peter Johansson
12
13  This file is part of the yat library, http://dev.thep.lu.se/yat
14
15  The yat library is free software; you can redistribute it and/or
16  modify it under the terms of the GNU General Public License as
17  published by the Free Software Foundation; either version 3 of the
18  License, or (at your option) any later version.
19
20  The yat library is distributed in the hope that it will be useful,
21  but WITHOUT ANY WARRANTY; without even the implied warranty of
22  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23  General Public License for more details.
24
25  You should have received a copy of the GNU General Public License
26  along with yat. If not, see <http://www.gnu.org/licenses/>.
27*/
28
29#include "Percentiler.h"
30
31#include "yat/classifier/DataLookupWeighted1D.h"
32#include "yat/classifier/Target.h"
33#include "yat/normalizer/utility.h"
34#include "yat/utility/concept_check.h"
35#include "yat/utility/DataIterator.h"
36#include "yat/utility/deprecate.h"
37#include "yat/utility/iterator_traits.h"
38#include "yat/utility/VectorBase.h"
39#include "yat/utility/yat_assert.h"
40
41#include <boost/concept_check.hpp>
42
43#include <algorithm>
44#include <cmath>
45#include <iterator>
46#include <stdexcept>
47#include <vector>
48
49#include <gsl/gsl_statistics_double.h>
50
51namespace theplu {
52namespace yat {
53namespace statistics {
54
55  /**
56     Adding a range [\a first, \a last) into an object of type T. The
57     requirements for the type T is to have an add(double, bool, double)
58     function.
59  */
60  template <typename T, typename ForwardIterator>
61  void add(T& o, ForwardIterator first, ForwardIterator last,
62           const classifier::Target& target);
63
64  /**
65     \brief Benjamini Hochberg multiple test correction
66
67     Given a sorted range of p-values such that
68     \f$ p_1 \le p_2 \le ... \le p_N \f$ a Benjamnini-Hochberg corrected
69     p-value, \c q, is calculated recursively as
70     \f$ q_i = \f$ min \f$(p_i \frac{N}{i}, q_{i+1})\f$ with the anchor
71     constraint that \f$ q_m = p_m \f$.
72
73     Requirements: \c BidirectionalIterator1 should be a
74     \bidirectional_iterator and \c BidirectionalIterator2 should be a
75     mutable \bidirectional_iterator
76
77     \since New in yat 0.8
78   */
79  template<typename BidirectionalIterator1, typename BidirectionalIterator2>
80  void benjamini_hochberg(BidirectionalIterator1 first,
81                          BidirectionalIterator1 last,
82                          BidirectionalIterator2 result);
83
84
85  ///
86  /// Calculates the probability to get \a k or smaller from a
87  /// hypergeometric distribution with parameters \a n1 \a n2 \a
88  /// t. Hypergeomtric situation you get in the following situation:
89  /// Let there be \a n1 ways for a "good" selection and \a n2 ways
90  /// for a "bad" selection out of a total of possibilities. Take \a
91  /// t samples without replacement and \a k of those are "good"
92  /// samples. \a k will follow a hypergeomtric distribution.
93  ///
94  /// @return cumulative hypergeomtric distribution functions P(k).
95  ///
96  double cdf_hypergeometric_P(unsigned int k, unsigned int n1, 
97                              unsigned int n2, unsigned int t);
98
99
100  /**
101     The entropy is calculated as \f$ - \sum_i p_i \log p_i \f$ where
102     \f$p_i = \frac{n_i}{\sum_j n_j} \f$
103
104     Requirements:
105     - \c InputIterator should be an \input_iterator
106     - \c InputIterator::value_type must be convertible to \c double
107
108     \since New in yat 0.12
109   */
110  template<typename InputIterator>
111  double entropy(InputIterator first, InputIterator last);
112
113  /**
114     \brief one-sided p-value
115
116     This function uses the t-distribution to calculate the one-sided
117     p-value. Given that the true correlation is zero (Null
118     hypothesis) the estimated correlation, r, after a transformation
119     is t-distributed:
120
121     \f$ \sqrt{(n-2)} \frac{r}{\sqrt{(1-r^2)}} \in t(n-2) \f$
122
123     \return Probability that correlation is larger than \a r by
124     chance when having \a n samples. For \a r larger or equal to 1.0,
125     0.0 is returned. For \a r smaller or equal to -1.0, 1.0 is
126     returned.
127   */
128  double pearson_p_value(double r, unsigned int n);
129
130  ///
131  /// @brief Computes the kurtosis of the data in a vector.
132  ///
133  /// The kurtosis measures how sharply peaked a distribution is,
134  /// relative to its width. The kurtosis is normalized to zero for a
135  /// gaussian distribution.
136  ///
137  double kurtosis(const utility::VectorBase&);
138
139
140  ///
141  /// @brief Median absolute deviation from median
142  ///
143  /// Function is non-mutable function
144  ///
145  /// Requirements: \c RandomAccessIterator should be a \ref
146  /// concept_data_iterator and \random_access_iterator
147  ///
148  /// Since 0.6 function also work with a \ref
149  /// concept_weighted_iterator
150  ///
151  template <class RandomAccessIterator>
152  double mad(RandomAccessIterator first, RandomAccessIterator last,
153             bool sorted=false);
154
155
156  ///
157  /// Median is defined to be value in the middle. If number of values
158  /// is even median is the average of the two middle values.  the
159  /// median value is given by p equal to 50. If \a sorted is false
160  /// (default), the range is copied, the copy is sorted, and then
161  /// used to calculate the median.
162  ///
163  /// Function is a non-mutable function, i.e., \a first and \a last
164  /// can be const_iterators.
165  ///
166  /// Requirements: \c RandomAccessIterator should be a \ref
167  /// concept_data_iterator and \random_access_iterator
168  ///
169  /// @return median of range
170  ///
171  template <class RandomAccessIterator>
172  double median(RandomAccessIterator first, RandomAccessIterator last,
173                bool sorted=false);
174
175
176  /**
177     The percentile is determined by the \a p, a number between 0 and
178     100. The percentile is found by interpolation, using the formula
179     \f$ percentile = (1 - \delta) x_i + \delta x_{i+1} \f$ where \a
180     p is floor\f$((n - 1)p/100)\f$ and \f$ \delta \f$ is \f$
181     (n-1)p/100 - i \f$.Thus the minimum value of the vector is given
182     by p equal to zero, the maximum is given by p equal to 100 and
183     the median value is given by p equal to 50. If @a sorted
184     is false (default), the vector is copied, the copy is sorted,
185     and then used to calculate the median.
186
187     Function is a non-mutable function, i.e., \a first and \a last
188     can be const_iterators.
189
190     Requirements: RandomAccessIterator is an iterator over a range of
191     doubles (or any type being convertable to double).
192
193     @return \a p'th percentile of range
194
195     \deprecated percentile2 will replace this function in the future
196
197     \note the definition of percentile used here is not identical to
198     that one used in percentile2 and Percentile. The difference is
199     smaller for large ranges.
200  */
201  template <class RandomAccessIterator>
202  double percentile(RandomAccessIterator first, RandomAccessIterator last,
203                    double p, bool sorted=false) YAT_DEPRECATE;
204
205  /**
206     \see Percentiler
207
208     \since new in yat 0.5
209   */
210  template <class RandomAccessIterator>
211  double percentile2(RandomAccessIterator first, RandomAccessIterator last,
212                     double p, bool sorted=false)
213  {
214    Percentiler percentiler(p, sorted);
215    return percentiler(first, last);
216  }
217
218  ///
219  /// @brief Computes the skewness of the data in a vector.
220  ///
221  /// The skewness measures the asymmetry of the tails of a
222  /// distribution.
223  ///
224  double skewness(const utility::VectorBase&);
225 
226
227  //////// template implementations ///////////
228
229  template <typename T, typename ForwardIterator>
230  void add(T& o, ForwardIterator first, ForwardIterator last,
231           const classifier::Target& target)
232  {
233    utility::iterator_traits<ForwardIterator> traits;
234    for (size_t i=0; first!=last; ++i, ++first)
235      o.add(traits.data(first), target.binary(i), traits.weight(first));
236  } 
237
238
239  template<typename BidirectionalIterator1, typename BidirectionalIterator2>
240  void benjamini_hochberg(BidirectionalIterator1 first,
241                          BidirectionalIterator1 last,
242                          BidirectionalIterator2 result)
243  {
244    using boost::Mutable_BidirectionalIterator;
245    BOOST_CONCEPT_ASSERT((boost::BidirectionalIterator<BidirectionalIterator1>));
246    BOOST_CONCEPT_ASSERT((Mutable_BidirectionalIterator<BidirectionalIterator2>));
247    size_t n = std::distance(first, last);
248    if (!n)
249      return;
250    std::advance(result, n-1);
251    first = last;
252    --first;
253    size_t rank = n;
254   
255    double prev = 1.0;
256    while (rank) {
257      *result = std::min(*first * n/static_cast<double>(rank), prev);
258      prev = *result;
259      --rank;
260      --first;
261      --result;
262    }
263  }
264
265
266  template<typename InputIterator>
267  double entropy(InputIterator first, InputIterator last)
268  {
269    BOOST_CONCEPT_ASSERT((boost::InputIterator<InputIterator>));
270    using boost::Convertible;
271    typedef typename InputIterator::value_type T;
272    BOOST_CONCEPT_ASSERT((Convertible<T,double>));
273    double sum = 0;
274    double N = 0;
275    for (; first != last; ++first) {
276      if (*first) {
277        N += *first;
278        sum += *first * std::log(static_cast<double>(*first));
279      }
280    }
281    return -sum / N + log(N);
282  }
283
284
285  template <class RandomAccessIterator>
286  double mad(RandomAccessIterator first, RandomAccessIterator last, 
287             bool sorted)
288  {
289    BOOST_CONCEPT_ASSERT((boost::RandomAccessIterator<RandomAccessIterator>));
290    BOOST_CONCEPT_ASSERT((utility::DataIteratorConcept<RandomAccessIterator>));
291    double m = median(first, last, sorted);
292    typedef typename std::iterator_traits<RandomAccessIterator>::value_type T;
293    std::vector<T> ad(std::distance(first, last));
294    // copy weights if needed
295    normalizer::detail::copy_weight_if_weighted(first, last, ad.begin());
296    // assign data (absolute deviation from median)
297    utility::iterator_traits<RandomAccessIterator> traits;
298    utility::DataIterator<typename std::vector<T>::iterator> first2(ad.begin());
299    while (first!=last) {
300      *first2 = std::abs(traits.data(first)-m);
301      ++first;
302      ++first2;
303    }
304    std::sort(ad.begin(), ad.end());
305    return median(ad.begin(), ad.end(), true);
306  }
307 
308
309  template <class RandomAccessIterator> 
310  double median(RandomAccessIterator first, RandomAccessIterator last, 
311                bool sorted) 
312  { 
313    return percentile2(first, last, 50.0, sorted); 
314  }
315
316
317  template <class RandomAccessIterator>
318  double percentile(RandomAccessIterator first, RandomAccessIterator last, 
319                    double p, bool sorted)
320  {
321    BOOST_CONCEPT_ASSERT((boost::RandomAccessIterator<RandomAccessIterator>));
322    BOOST_CONCEPT_ASSERT((utility::DataIteratorConcept<RandomAccessIterator>));
323    // range is one value only is a special case
324    if (first+1 == last)
325      return utility::iterator_traits<RandomAccessIterator>().data(first);
326    if (sorted) {
327      // have to take care of this special case
328      if (p>=100)
329        return utility::iterator_traits<RandomAccessIterator>().data(--last);
330      double j = p/100 * (std::distance(first,last)-1);
331      int i = static_cast<int>(j);
332      return (1-j+floor(j))*first[i] + (j-floor(j))*first[i+1];
333    }
334    using std::iterator_traits;
335    typedef typename iterator_traits<RandomAccessIterator>::value_type value_t;
336    std::vector<value_t> v_copy;
337    v_copy.reserve(std::distance(first,last));
338    std::copy(first, last, std::back_inserter(v_copy));
339    size_t i = static_cast<size_t>(p/100 * (v_copy.size()-1));
340    if (i+2 < v_copy.size()) {
341      std::partial_sort(v_copy.begin(), v_copy.begin()+i+2, v_copy.end());
342    }
343    else
344      std::sort(v_copy.begin(), v_copy.end());
345    return percentile(v_copy.begin(), v_copy.end(), p, true);
346  }
347
348
349}}} // of namespace statistics, yat, and theplu
350
351#endif
Note: See TracBrowser for help on using the repository browser.