source: trunk/yat/statistics/utility.h @ 2502

Last change on this file since 2502 was 2502, checked in by Peter, 10 years ago

mad: avoid copying data

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 10.4 KB
Line 
1#ifndef _theplu_yat_statistics_utility_
2#define _theplu_yat_statistics_utility_
3
4// $Id: utility.h 2502 2011-06-28 21:09:54Z peter $
5
6/*
7  Copyright (C) 2004 Jari Häkkinen, Peter Johansson
8  Copyright (C) 2005 Peter Johansson
9  Copyright (C) 2006 Jari Häkkinen, Peter Johansson, Markus Ringnér
10  Copyright (C) 2007, 2008 Jari Häkkinen, Peter Johansson
11  Copyright (C) 2009, 2010, 2011 Peter Johansson
12
13  This file is part of the yat library, http://dev.thep.lu.se/yat
14
15  The yat library is free software; you can redistribute it and/or
16  modify it under the terms of the GNU General Public License as
17  published by the Free Software Foundation; either version 3 of the
18  License, or (at your option) any later version.
19
20  The yat library is distributed in the hope that it will be useful,
21  but WITHOUT ANY WARRANTY; without even the implied warranty of
22  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23  General Public License for more details.
24
25  You should have received a copy of the GNU General Public License
26  along with yat. If not, see <http://www.gnu.org/licenses/>.
27*/
28
29#include "Percentiler.h"
30
31#include "yat/classifier/DataLookupWeighted1D.h"
32#include "yat/classifier/Target.h"
33#include "yat/normalizer/utility.h"
34#include "yat/utility/concept_check.h"
35#include "yat/utility/DataIterator.h"
36#include "yat/utility/deprecate.h"
37#include "yat/utility/iterator_traits.h"
38#include "yat/utility/VectorBase.h"
39#include "yat/utility/yat_assert.h"
40
41#include <boost/concept_check.hpp>
42
43#include <algorithm>
44#include <cmath>
45#include <iterator>
46#include <stdexcept>
47#include <vector>
48
49#include <gsl/gsl_statistics_double.h>
50
51namespace theplu {
52namespace yat {
53namespace statistics { 
54
55  /**
56     Adding a range [\a first, \a last) into an object of type T. The
57     requirements for the type T is to have an add(double, bool, double)
58     function.
59  */
60  template <typename T, typename ForwardIterator>
61  void add(T& o, ForwardIterator first, ForwardIterator last,
62           const classifier::Target& target);
63
64  /**
65     \brief Benjamini Hochberg multiple test correction
66
67     Given a sorted range of p-values such that
68     \f$ p_1 \le p_2 \le ... \le p_N \f$ a Benjamnini-Hochberg corrected
69     p-value, \c q, is calculated recursively as
70     \f$ q_i = \f$ min \f$(p_i \frac{N}{i}, q_{i+1})\f$ with the anchor
71     constraint that \f$ q_m = p_m \f$.
72
73     Requirements: \c BidirectionalIterator1 should be a
74     \bidirectional_iterator and \c BidirectionalIterator2 should be a
75     mutable \bidirectional_iterator
76
77     \since New in yat 0.8
78   */
79  template<typename BidirectionalIterator1, typename BidirectionalIterator2>
80  void benjamini_hochberg(BidirectionalIterator1 first,
81                          BidirectionalIterator1 last,
82                          BidirectionalIterator2 result);
83                         
84
85  ///
86  /// Calculates the probability to get \a k or smaller from a
87  /// hypergeometric distribution with parameters \a n1 \a n2 \a
88  /// t. Hypergeomtric situation you get in the following situation:
89  /// Let there be \a n1 ways for a "good" selection and \a n2 ways
90  /// for a "bad" selection out of a total of possibilities. Take \a
91  /// t samples without replacement and \a k of those are "good"
92  /// samples. \a k will follow a hypergeomtric distribution.
93  ///
94  /// @return cumulative hypergeomtric distribution functions P(k).
95  ///
96  double cdf_hypergeometric_P(unsigned int k, unsigned int n1, 
97                              unsigned int n2, unsigned int t);
98
99
100  /**
101     \brief one-sided p-value
102
103     This function uses the t-distribution to calculate the one-sided
104     p-value. Given that the true correlation is zero (Null
105     hypothesis) the estimated correlation, r, after a transformation
106     is t-distributed:
107
108     \f$ \sqrt{(n-2)} \frac{r}{\sqrt{(1-r^2)}} \in t(n-2) \f$
109
110     \return Probability that correlation is larger than \a r by
111     chance when having \a n samples. For \a r larger or equal to 1.0,
112     0.0 is returned. For \a r smaller or equal to -1.0, 1.0 is
113     returned.
114   */
115  double pearson_p_value(double r, unsigned int n);
116
117  ///
118  /// @brief Computes the kurtosis of the data in a vector.
119  ///
120  /// The kurtosis measures how sharply peaked a distribution is,
121  /// relative to its width. The kurtosis is normalized to zero for a
122  /// gaussian distribution.
123  ///
124  double kurtosis(const utility::VectorBase&);
125
126
127  ///
128  /// @brief Median absolute deviation from median
129  ///
130  /// Function is non-mutable function
131  ///
132  /// Requirements: \c RandomAccessIterator should be a \ref
133  /// concept_data_iterator and \random_access_iterator
134  ///
135  /// Since 0.6 function also work with a \ref
136  /// concept_weighted_iterator
137  ///
138  template <class RandomAccessIterator>
139  double mad(RandomAccessIterator first, RandomAccessIterator last, 
140             bool sorted=false);
141
142
143  ///
144  /// Median is defined to be value in the middle. If number of values
145  /// is even median is the average of the two middle values.  the
146  /// median value is given by p equal to 50. If \a sorted is false
147  /// (default), the range is copied, the copy is sorted, and then
148  /// used to calculate the median.
149  ///
150  /// Function is a non-mutable function, i.e., \a first and \a last
151  /// can be const_iterators.
152  ///
153  /// Requirements: \c RandomAccessIterator should be a \ref
154  /// concept_data_iterator and \random_access_iterator
155  ///
156  /// @return median of range
157  ///
158  template <class RandomAccessIterator> 
159  double median(RandomAccessIterator first, RandomAccessIterator last, 
160                bool sorted=false); 
161
162
163  /**
164     The percentile is determined by the \a p, a number between 0 and
165     100. The percentile is found by interpolation, using the formula
166     \f$ percentile = (1 - \delta) x_i + \delta x_{i+1} \f$ where \a
167     p is floor\f$((n - 1)p/100)\f$ and \f$ \delta \f$ is \f$
168     (n-1)p/100 - i \f$.Thus the minimum value of the vector is given
169     by p equal to zero, the maximum is given by p equal to 100 and
170     the median value is given by p equal to 50. If @a sorted
171     is false (default), the vector is copied, the copy is sorted,
172     and then used to calculate the median.
173
174     Function is a non-mutable function, i.e., \a first and \a last
175     can be const_iterators.
176     
177     Requirements: RandomAccessIterator is an iterator over a range of
178     doubles (or any type being convertable to double).
179     
180     @return \a p'th percentile of range
181
182     \deprecated percentile2 will replace this function in the future
183
184     \note the definition of percentile used here is not identical to
185     that one used in percentile2 and Percentile. The difference is
186     smaller for large ranges.
187  */
188  template <class RandomAccessIterator>
189  double percentile(RandomAccessIterator first, RandomAccessIterator last, 
190                    double p, bool sorted=false) YAT_DEPRECATE;
191 
192
193  /**
194     \see Percentiler
195     
196     \since new in yat 0.5
197   */
198  template <class RandomAccessIterator>
199  double percentile2(RandomAccessIterator first, RandomAccessIterator last, 
200                     double p, bool sorted=false)
201  {
202    Percentiler percentiler(p, sorted);
203    return percentiler(first, last);
204  }
205
206  ///
207  /// @brief Computes the skewness of the data in a vector.
208  ///
209  /// The skewness measures the asymmetry of the tails of a
210  /// distribution.
211  ///
212  double skewness(const utility::VectorBase&);
213 
214
215  //////// template implementations ///////////
216
217  template <typename T, typename ForwardIterator>
218  void add(T& o, ForwardIterator first, ForwardIterator last,
219           const classifier::Target& target)
220  {
221    utility::iterator_traits<ForwardIterator> traits;
222    for (size_t i=0; first!=last; ++i, ++first)
223      o.add(traits.data(first), target.binary(i), traits.weight(first));
224  } 
225
226
227  template<typename BidirectionalIterator1, typename BidirectionalIterator2>
228  void benjamini_hochberg(BidirectionalIterator1 first,
229                          BidirectionalIterator1 last,
230                          BidirectionalIterator2 result)
231  {
232    using boost::Mutable_BidirectionalIterator;
233    BOOST_CONCEPT_ASSERT((boost::BidirectionalIterator<BidirectionalIterator1>));
234    BOOST_CONCEPT_ASSERT((Mutable_BidirectionalIterator<BidirectionalIterator2>));
235    size_t n = std::distance(first, last);
236    if (!n)
237      return;
238    std::advance(result, n-1);
239    first = last;
240    --first;
241    size_t rank = n;
242   
243    double prev = 1.0;
244    while (rank) {
245      *result = std::min(*first * n/static_cast<double>(rank), prev);
246      prev = *result;
247      --rank;
248      --first;
249      --result;
250    }
251  }
252
253
254  template <class RandomAccessIterator>
255  double mad(RandomAccessIterator first, RandomAccessIterator last, 
256             bool sorted)
257  {
258    BOOST_CONCEPT_ASSERT((boost::RandomAccessIterator<RandomAccessIterator>));
259    BOOST_CONCEPT_ASSERT((utility::DataIteratorConcept<RandomAccessIterator>));
260    double m = median(first, last, sorted);
261    typedef typename std::iterator_traits<RandomAccessIterator>::value_type T;
262    std::vector<T> ad(std::distance(first, last));
263    // copy weights if needed
264    normalizer::detail::copy_weight_if_weighted(first, last, ad.begin());
265    // assign data (absolute deviation from median)
266    utility::iterator_traits<RandomAccessIterator> traits;
267    utility::DataIterator<typename std::vector<T>::iterator> first2(ad.begin());
268    while (first!=last) {
269      *first2 = std::abs(traits.data(first)-m);
270      ++first;
271      ++first2;
272    }
273    std::sort(ad.begin(), ad.end());
274    return median(ad.begin(), ad.end(), true);
275  }
276 
277
278  template <class RandomAccessIterator> 
279  double median(RandomAccessIterator first, RandomAccessIterator last, 
280                bool sorted) 
281  { 
282    return percentile2(first, last, 50.0, sorted); 
283  }
284
285
286  template <class RandomAccessIterator>
287  double percentile(RandomAccessIterator first, RandomAccessIterator last, 
288                    double p, bool sorted=false)
289  {
290    BOOST_CONCEPT_ASSERT((boost::RandomAccessIterator<RandomAccessIterator>));
291    BOOST_CONCEPT_ASSERT((utility::DataIteratorConcept<RandomAccessIterator>));
292    // range is one value only is a special case
293    if (first+1 == last)
294      return utility::iterator_traits<RandomAccessIterator>().data(first);
295    if (sorted) {
296      // have to take care of this special case
297      if (p>=100)
298        return utility::iterator_traits<RandomAccessIterator>().data(--last);
299      double j = p/100 * (std::distance(first,last)-1);
300      int i = static_cast<int>(j);
301      return (1-j+floor(j))*first[i] + (j-floor(j))*first[i+1];
302    }
303
304    std::vector<typename std::iterator_traits<RandomAccessIterator>::value_type> 
305      v_copy;
306    v_copy.reserve(std::distance(first,last));
307    std::copy(first, last, std::back_inserter(v_copy));
308    size_t i = static_cast<size_t>(p/100 * (v_copy.size()-1));
309    if (i+2 < v_copy.size()) {
310      std::partial_sort(v_copy.begin(), v_copy.begin()+i+2, v_copy.end());
311    }
312    else
313      std::sort(v_copy.begin(), v_copy.end());
314    return percentile(v_copy.begin(), v_copy.end(), p, true);
315  }
316
317
318}}} // of namespace statistics, yat, and theplu
319
320#endif
Note: See TracBrowser for help on using the repository browser.