source: trunk/yat/statistics/utility.h @ 2118

Last change on this file since 2118 was 2118, checked in by Peter, 12 years ago

mention that since yat 0.6 mad works with weighted iterators

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 8.1 KB
Line 
1#ifndef _theplu_yat_statistics_utility_
2#define _theplu_yat_statistics_utility_
3
4// $Id: utility.h 2118 2009-12-08 18:38:38Z peter $
5
6/*
7  Copyright (C) 2004 Jari Häkkinen, Peter Johansson
8  Copyright (C) 2005 Peter Johansson
9  Copyright (C) 2006 Jari Häkkinen, Peter Johansson, Markus Ringnér
10  Copyright (C) 2007, 2008 Jari Häkkinen, Peter Johansson
11  Copyright (C) 2009 Peter Johansson
12
13  This file is part of the yat library, http://dev.thep.lu.se/yat
14
15  The yat library is free software; you can redistribute it and/or
16  modify it under the terms of the GNU General Public License as
17  published by the Free Software Foundation; either version 3 of the
18  License, or (at your option) any later version.
19
20  The yat library is distributed in the hope that it will be useful,
21  but WITHOUT ANY WARRANTY; without even the implied warranty of
22  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23  General Public License for more details.
24
25  You should have received a copy of the GNU General Public License
26  along with yat. If not, see <http://www.gnu.org/licenses/>.
27*/
28
29#include "Percentiler.h"
30
31#include "yat/classifier/DataLookupWeighted1D.h"
32#include "yat/classifier/Target.h"
33#include "yat/utility/deprecate.h"
34#include "yat/utility/iterator_traits.h"
35#include "yat/utility/VectorBase.h"
36#include "yat/utility/yat_assert.h"
37
38#include <algorithm>
39#include <cmath>
40#include <stdexcept>
41#include <vector>
42
43#include <gsl/gsl_statistics_double.h>
44
45namespace theplu {
46namespace yat {
47namespace statistics { 
48
49  /**
50     Adding a range [\a first, \a last) into an object of type T. The
51     requirements for the type T is to have an add(double, bool, double)
52     function.
53  */
54  template <typename T, typename ForwardIterator>
55  void add(T& o, ForwardIterator first, ForwardIterator last,
56           const classifier::Target& target);
57
58  ///
59  /// Calculates the probability to get \a k or smaller from a
60  /// hypergeometric distribution with parameters \a n1 \a n2 \a
61  /// t. Hypergeomtric situation you get in the following situation:
62  /// Let there be \a n1 ways for a "good" selection and \a n2 ways
63  /// for a "bad" selection out of a total of possibilities. Take \a
64  /// t samples without replacement and \a k of those are "good"
65  /// samples. \a k will follow a hypergeomtric distribution.
66  ///
67  /// @return cumulative hypergeomtric distribution functions P(k).
68  ///
69  double cdf_hypergeometric_P(unsigned int k, unsigned int n1, 
70                              unsigned int n2, unsigned int t);
71
72
73  /**
74     \brief one-sided p-value
75
76     This function uses the t-distribution to calculate the one-sided
77     p-value. Given that the true correlation is zero (Null
78     hypothesis) the estimated correlation, r, after a transformation
79     is t-distributed:
80
81     \f$ \sqrt{(n-2)} \frac{r}{\sqrt{(1-r^2)}} \in t(n-2) \f$
82
83     \return Probability that correlation is larger than \a r by
84     chance when having \a n samples. For \a r larger or equal to 1.0,
85     0.0 is returned. For \a r smaller or equal to -1.0, 1.0 is
86     returned.
87   */
88  double pearson_p_value(double r, unsigned int n);
89
90  ///
91  /// @brief Computes the kurtosis of the data in a vector.
92  ///
93  /// The kurtosis measures how sharply peaked a distribution is,
94  /// relative to its width. The kurtosis is normalized to zero for a
95  /// gaussian distribution.
96  ///
97  double kurtosis(const utility::VectorBase&);
98
99
100  ///
101  /// @brief Median absolute deviation from median
102  ///
103  /// Function is non-mutable function
104  ///
105  /// Since 0.6 function also work with a \ref
106  /// concept_weighted_iterator
107  ///
108  template <class T>
109  double mad(T first, T last, const bool sorted=false);
110
111
112  ///
113  /// Median is defined to be value in the middle. If number of values
114  /// is even median is the average of the two middle values.  the
115  /// median value is given by p equal to 50. If \a sorted is false
116  /// (default), the range is copied, the copy is sorted, and then
117  /// used to calculate the median.
118  ///
119  /// Function is a non-mutable function, i.e., \a first and \a last
120  /// can be const_iterators.
121  ///
122  /// Requirements: \c T should either be unweighted with value type
123  /// convertible to \c double or \c T should be weighted with value
124  /// type convertible to utility::DataWeight
125  ///
126  /// @return median of range
127  ///
128  template <class T> 
129  double median(T first, T last, const bool sorted=false); 
130
131
132  /**
133     The percentile is determined by the \a p, a number between 0 and
134     100. The percentile is found by interpolation, using the formula
135     \f$ percentile = (1 - \delta) x_i + \delta x_{i+1} \f$ where \a
136     p is floor\f$((n - 1)p/100)\f$ and \f$ \delta \f$ is \f$
137     (n-1)p/100 - i \f$.Thus the minimum value of the vector is given
138     by p equal to zero, the maximum is given by p equal to 100 and
139     the median value is given by p equal to 50. If @a sorted
140     is false (default), the vector is copied, the copy is sorted,
141     and then used to calculate the median.
142
143     Function is a non-mutable function, i.e., \a first and \a last
144     can be const_iterators.
145     
146     Requirements: RandomAccessIterator is an iterator over a range of
147     doubles (or any type being convertable to double).
148     
149     @return \a p'th percentile of range
150
151     \deprecated percentile2 will replace this function in the future
152
153     \note the definition of percentile used here is not identical to
154     that one used in percentile2 and Percentile. The difference is
155     smaller for large ranges.
156  */
157  template <class RandomAccessIterator>
158  double percentile(RandomAccessIterator first, RandomAccessIterator last, 
159                    double p, bool sorted=false) YAT_DEPRECATE;
160 
161
162  /**
163     \see Percentiler
164     
165     \since new in yat 0.5
166   */
167  template <class RandomAccessIterator>
168  double percentile2(RandomAccessIterator first, RandomAccessIterator last, 
169                     double p, bool sorted=false)
170  {
171    Percentiler percentiler(p, sorted);
172    return percentiler(first, last);
173  }
174
175  ///
176  /// @brief Computes the skewness of the data in a vector.
177  ///
178  /// The skewness measures the asymmetry of the tails of a
179  /// distribution.
180  ///
181  double skewness(const utility::VectorBase&);
182 
183
184  //////// template implementations ///////////
185
186  template <typename T, typename ForwardIterator>
187  void add(T& o, ForwardIterator first, ForwardIterator last,
188           const classifier::Target& target)
189  {
190    for (size_t i=0; first!=last; ++i, ++first)
191      o.add(utility::iterator_traits<ForwardIterator>().data(first),
192            target.binary(i), 
193            utility::iterator_traits<ForwardIterator>().weight(first));
194  } 
195
196
197  template <class T>
198  double mad(T first, T last, const bool sorted=false)
199  {
200    double m = median(first, last, sorted);
201    typedef std::vector<typename std::iterator_traits<T>::value_type> Vec;
202    Vec ad;
203    ad.reserve(std::distance(first, last));
204    // copy values (and weights if T is weighted)
205    std::copy(first, last, std::back_insert_iterator<Vec>(ad));
206    utility::iterator_traits<typename Vec::iterator> traits;
207    for (typename Vec::iterator i=ad.begin() ; i!=ad.end(); ++i) {
208      // adjust data
209      traits.data(i) = std::abs(traits.data(i)-m);
210    }
211    std::sort(ad.begin(), ad.end());
212    return median(ad.begin(), ad.end(), true);
213  }
214 
215
216  template <class T> 
217  double median(T first, T last, const bool sorted=false) 
218  { 
219    return percentile2(first, last, 50.0, sorted); 
220  }
221
222
223  template <class RandomAccessIterator>
224  double percentile(RandomAccessIterator first, RandomAccessIterator last, 
225                    double p, bool sorted=false)
226  {
227    // range is one value only is a special case
228    if (first+1 == last)
229      return utility::iterator_traits<RandomAccessIterator>().data(first);
230    if (sorted) {
231      // have to take care of this special case
232      if (p>=100)
233        return utility::iterator_traits<RandomAccessIterator>().data(--last);
234      double j = p/100 * (std::distance(first,last)-1);
235      int i = static_cast<int>(j);
236      return (1-j+floor(j))*first[i] + (j-floor(j))*first[i+1];
237    }
238
239    std::vector<typename std::iterator_traits<RandomAccessIterator>::value_type> 
240      v_copy;
241    v_copy.reserve(std::distance(first,last));
242    std::copy(first, last, std::back_inserter(v_copy));
243    size_t i = static_cast<size_t>(p/100 * (v_copy.size()-1));
244    if (i+2 < v_copy.size()) {
245      std::partial_sort(v_copy.begin(), v_copy.begin()+i+2, v_copy.end());
246    }
247    else
248      std::sort(v_copy.begin(), v_copy.end());
249    return percentile(v_copy.begin(), v_copy.end(), p, true);
250  }
251
252
253}}} // of namespace statistics, yat, and theplu
254
255#endif
Note: See TracBrowser for help on using the repository browser.