source: trunk/yat/statistics/utility.h @ 2496

Last change on this file since 2496 was 2496, checked in by Peter, 10 years ago

benjamini-hochberg: avoid crash when input is empty range. Speed-up when iterator is not random access.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 10.3 KB
Line 
1#ifndef _theplu_yat_statistics_utility_
2#define _theplu_yat_statistics_utility_
3
4// $Id: utility.h 2496 2011-06-22 14:31:25Z peter $
5
6/*
7  Copyright (C) 2004 Jari Häkkinen, Peter Johansson
8  Copyright (C) 2005 Peter Johansson
9  Copyright (C) 2006 Jari Häkkinen, Peter Johansson, Markus Ringnér
10  Copyright (C) 2007, 2008 Jari Häkkinen, Peter Johansson
11  Copyright (C) 2009, 2010, 2011 Peter Johansson
12
13  This file is part of the yat library, http://dev.thep.lu.se/yat
14
15  The yat library is free software; you can redistribute it and/or
16  modify it under the terms of the GNU General Public License as
17  published by the Free Software Foundation; either version 3 of the
18  License, or (at your option) any later version.
19
20  The yat library is distributed in the hope that it will be useful,
21  but WITHOUT ANY WARRANTY; without even the implied warranty of
22  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23  General Public License for more details.
24
25  You should have received a copy of the GNU General Public License
26  along with yat. If not, see <http://www.gnu.org/licenses/>.
27*/
28
29#include "Percentiler.h"
30
31#include "yat/classifier/DataLookupWeighted1D.h"
32#include "yat/classifier/Target.h"
33#include "yat/utility/concept_check.h"
34#include "yat/utility/deprecate.h"
35#include "yat/utility/iterator_traits.h"
36#include "yat/utility/VectorBase.h"
37#include "yat/utility/yat_assert.h"
38
39#include <boost/concept_check.hpp>
40
41#include <algorithm>
42#include <cmath>
43#include <iterator>
44#include <stdexcept>
45#include <vector>
46
47#include <gsl/gsl_statistics_double.h>
48
49namespace theplu {
50namespace yat {
51namespace statistics { 
52
53  /**
54     Adding a range [\a first, \a last) into an object of type T. The
55     requirements for the type T is to have an add(double, bool, double)
56     function.
57  */
58  template <typename T, typename ForwardIterator>
59  void add(T& o, ForwardIterator first, ForwardIterator last,
60           const classifier::Target& target);
61
62  /**
63     \brief Benjamini Hochberg multiple test correction
64
65     Given a sorted range of p-values such that
66     \f$ p_1 \le p_2 \le ... \le p_N \f$ a Benjamnini-Hochberg corrected
67     p-value, \c q, is calculated recursively as
68     \f$ q_i = \f$ min \f$(p_i \frac{N}{i}, q_{i+1})\f$ with the anchor
69     constraint that \f$ q_m = p_m \f$.
70
71     Requirements: \c BidirectionalIterator1 should be a
72     \bidirectional_iterator and \c BidirectionalIterator2 should be a
73     mutable \bidirectional_iterator
74
75     \since New in yat 0.8
76   */
77  template<typename BidirectionalIterator1, typename BidirectionalIterator2>
78  void benjamini_hochberg(BidirectionalIterator1 first,
79                          BidirectionalIterator1 last,
80                          BidirectionalIterator2 result);
81                         
82
83  ///
84  /// Calculates the probability to get \a k or smaller from a
85  /// hypergeometric distribution with parameters \a n1 \a n2 \a
86  /// t. Hypergeomtric situation you get in the following situation:
87  /// Let there be \a n1 ways for a "good" selection and \a n2 ways
88  /// for a "bad" selection out of a total of possibilities. Take \a
89  /// t samples without replacement and \a k of those are "good"
90  /// samples. \a k will follow a hypergeomtric distribution.
91  ///
92  /// @return cumulative hypergeomtric distribution functions P(k).
93  ///
94  double cdf_hypergeometric_P(unsigned int k, unsigned int n1, 
95                              unsigned int n2, unsigned int t);
96
97
98  /**
99     \brief one-sided p-value
100
101     This function uses the t-distribution to calculate the one-sided
102     p-value. Given that the true correlation is zero (Null
103     hypothesis) the estimated correlation, r, after a transformation
104     is t-distributed:
105
106     \f$ \sqrt{(n-2)} \frac{r}{\sqrt{(1-r^2)}} \in t(n-2) \f$
107
108     \return Probability that correlation is larger than \a r by
109     chance when having \a n samples. For \a r larger or equal to 1.0,
110     0.0 is returned. For \a r smaller or equal to -1.0, 1.0 is
111     returned.
112   */
113  double pearson_p_value(double r, unsigned int n);
114
115  ///
116  /// @brief Computes the kurtosis of the data in a vector.
117  ///
118  /// The kurtosis measures how sharply peaked a distribution is,
119  /// relative to its width. The kurtosis is normalized to zero for a
120  /// gaussian distribution.
121  ///
122  double kurtosis(const utility::VectorBase&);
123
124
125  ///
126  /// @brief Median absolute deviation from median
127  ///
128  /// Function is non-mutable function
129  ///
130  /// Requirements: \c RandomAccessIterator should be a \ref
131  /// concept_data_iterator and \random_access_iterator
132  ///
133  /// Since 0.6 function also work with a \ref
134  /// concept_weighted_iterator
135  ///
136  template <class RandomAccessIterator>
137  double mad(RandomAccessIterator first, RandomAccessIterator last, 
138             bool sorted=false);
139
140
141  ///
142  /// Median is defined to be value in the middle. If number of values
143  /// is even median is the average of the two middle values.  the
144  /// median value is given by p equal to 50. If \a sorted is false
145  /// (default), the range is copied, the copy is sorted, and then
146  /// used to calculate the median.
147  ///
148  /// Function is a non-mutable function, i.e., \a first and \a last
149  /// can be const_iterators.
150  ///
151  /// Requirements: \c RandomAccessIterator should be a \ref
152  /// concept_data_iterator and \random_access_iterator
153  ///
154  /// @return median of range
155  ///
156  template <class RandomAccessIterator> 
157  double median(RandomAccessIterator first, RandomAccessIterator last, 
158                bool sorted=false); 
159
160
161  /**
162     The percentile is determined by the \a p, a number between 0 and
163     100. The percentile is found by interpolation, using the formula
164     \f$ percentile = (1 - \delta) x_i + \delta x_{i+1} \f$ where \a
165     p is floor\f$((n - 1)p/100)\f$ and \f$ \delta \f$ is \f$
166     (n-1)p/100 - i \f$.Thus the minimum value of the vector is given
167     by p equal to zero, the maximum is given by p equal to 100 and
168     the median value is given by p equal to 50. If @a sorted
169     is false (default), the vector is copied, the copy is sorted,
170     and then used to calculate the median.
171
172     Function is a non-mutable function, i.e., \a first and \a last
173     can be const_iterators.
174     
175     Requirements: RandomAccessIterator is an iterator over a range of
176     doubles (or any type being convertable to double).
177     
178     @return \a p'th percentile of range
179
180     \deprecated percentile2 will replace this function in the future
181
182     \note the definition of percentile used here is not identical to
183     that one used in percentile2 and Percentile. The difference is
184     smaller for large ranges.
185  */
186  template <class RandomAccessIterator>
187  double percentile(RandomAccessIterator first, RandomAccessIterator last, 
188                    double p, bool sorted=false) YAT_DEPRECATE;
189 
190
191  /**
192     \see Percentiler
193     
194     \since new in yat 0.5
195   */
196  template <class RandomAccessIterator>
197  double percentile2(RandomAccessIterator first, RandomAccessIterator last, 
198                     double p, bool sorted=false)
199  {
200    Percentiler percentiler(p, sorted);
201    return percentiler(first, last);
202  }
203
204  ///
205  /// @brief Computes the skewness of the data in a vector.
206  ///
207  /// The skewness measures the asymmetry of the tails of a
208  /// distribution.
209  ///
210  double skewness(const utility::VectorBase&);
211 
212
213  //////// template implementations ///////////
214
215  template <typename T, typename ForwardIterator>
216  void add(T& o, ForwardIterator first, ForwardIterator last,
217           const classifier::Target& target)
218  {
219    utility::iterator_traits<ForwardIterator> traits;
220    for (size_t i=0; first!=last; ++i, ++first)
221      o.add(traits.data(first), target.binary(i), traits.weight(first));
222  } 
223
224
225  template<typename BidirectionalIterator1, typename BidirectionalIterator2>
226  void benjamini_hochberg(BidirectionalIterator1 first,
227                          BidirectionalIterator1 last,
228                          BidirectionalIterator2 result)
229  {
230    using boost::Mutable_BidirectionalIterator;
231    BOOST_CONCEPT_ASSERT((boost::BidirectionalIterator<BidirectionalIterator1>));
232    BOOST_CONCEPT_ASSERT((Mutable_BidirectionalIterator<BidirectionalIterator2>));
233    size_t n = std::distance(first, last);
234    if (!n)
235      return;
236    std::advance(result, n-1);
237    first = last;
238    --first;
239    size_t rank = n;
240   
241    double prev = 1.0;
242    while (rank) {
243      *result = std::min(*first * n/static_cast<double>(rank), prev);
244      prev = *result;
245      --rank;
246      --first;
247      --result;
248    }
249  }
250
251
252  template <class RandomAccessIterator>
253  double mad(RandomAccessIterator first, RandomAccessIterator last, 
254             bool sorted)
255  {
256    BOOST_CONCEPT_ASSERT((boost::RandomAccessIterator<RandomAccessIterator>));
257    BOOST_CONCEPT_ASSERT((utility::DataIteratorConcept<RandomAccessIterator>));
258    double m = median(first, last, sorted);
259    typedef typename std::iterator_traits<RandomAccessIterator>::value_type T;
260    typedef std::vector<T> Vec;
261    Vec ad;
262    ad.reserve(std::distance(first, last));
263    // copy values (and weights if RandomAccessIterator is weighted)
264    std::copy(first, last, std::back_insert_iterator<Vec>(ad));
265    utility::iterator_traits<typename Vec::iterator> traits;
266    for (typename Vec::iterator i=ad.begin() ; i!=ad.end(); ++i) {
267      // adjust data
268      traits.data(i) = std::abs(traits.data(i)-m);
269    }
270    std::sort(ad.begin(), ad.end());
271    return median(ad.begin(), ad.end(), true);
272  }
273 
274
275  template <class RandomAccessIterator> 
276  double median(RandomAccessIterator first, RandomAccessIterator last, 
277                bool sorted) 
278  { 
279    return percentile2(first, last, 50.0, sorted); 
280  }
281
282
283  template <class RandomAccessIterator>
284  double percentile(RandomAccessIterator first, RandomAccessIterator last, 
285                    double p, bool sorted=false)
286  {
287    BOOST_CONCEPT_ASSERT((boost::RandomAccessIterator<RandomAccessIterator>));
288    BOOST_CONCEPT_ASSERT((utility::DataIteratorConcept<RandomAccessIterator>));
289    // range is one value only is a special case
290    if (first+1 == last)
291      return utility::iterator_traits<RandomAccessIterator>().data(first);
292    if (sorted) {
293      // have to take care of this special case
294      if (p>=100)
295        return utility::iterator_traits<RandomAccessIterator>().data(--last);
296      double j = p/100 * (std::distance(first,last)-1);
297      int i = static_cast<int>(j);
298      return (1-j+floor(j))*first[i] + (j-floor(j))*first[i+1];
299    }
300
301    std::vector<typename std::iterator_traits<RandomAccessIterator>::value_type> 
302      v_copy;
303    v_copy.reserve(std::distance(first,last));
304    std::copy(first, last, std::back_inserter(v_copy));
305    size_t i = static_cast<size_t>(p/100 * (v_copy.size()-1));
306    if (i+2 < v_copy.size()) {
307      std::partial_sort(v_copy.begin(), v_copy.begin()+i+2, v_copy.end());
308    }
309    else
310      std::sort(v_copy.begin(), v_copy.end());
311    return percentile(v_copy.begin(), v_copy.end(), p, true);
312  }
313
314
315}}} // of namespace statistics, yat, and theplu
316
317#endif
Note: See TracBrowser for help on using the repository browser.