source: trunk/yat/statistics/utility.h @ 1874

Last change on this file since 1874 was 1874, checked in by Peter, 14 years ago

merge patch release 0.5.2 into trunk. Delta 0.5.2 - 0.5.1

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 8.0 KB
Line 
1#ifndef _theplu_yat_statistics_utility_
2#define _theplu_yat_statistics_utility_
3
4// $Id: utility.h 1874 2009-03-17 22:09:01Z peter $
5
6/*
7  Copyright (C) 2004 Jari Häkkinen, Peter Johansson
8  Copyright (C) 2005 Peter Johansson
9  Copyright (C) 2006 Jari Häkkinen, Peter Johansson, Markus Ringnér
10  Copyright (C) 2007, 2008 Jari Häkkinen, Peter Johansson
11  Copyright (C) 2009 Peter Johansson
12
13  This file is part of the yat library, http://dev.thep.lu.se/yat
14
15  The yat library is free software; you can redistribute it and/or
16  modify it under the terms of the GNU General Public License as
17  published by the Free Software Foundation; either version 3 of the
18  License, or (at your option) any later version.
19
20  The yat library is distributed in the hope that it will be useful,
21  but WITHOUT ANY WARRANTY; without even the implied warranty of
22  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23  General Public License for more details.
24
25  You should have received a copy of the GNU General Public License
26  along with yat. If not, see <http://www.gnu.org/licenses/>.
27*/
28
29#include "Percentiler.h"
30
31#include "yat/classifier/DataLookupWeighted1D.h"
32#include "yat/classifier/Target.h"
33#include "yat/utility/deprecate.h"
34#include "yat/utility/iterator_traits.h"
35#include "yat/utility/VectorBase.h"
36#include "yat/utility/yat_assert.h"
37
38#include <algorithm>
39#include <cmath>
40#include <stdexcept>
41#include <vector>
42
43#include <gsl/gsl_statistics_double.h>
44
45namespace theplu {
46namespace yat {
47namespace statistics { 
48
49  /**
50     Adding a range [\a first, \a last) into an object of type T. The
51     requirements for the type T is to have an add(double, bool, double)
52     function.
53  */
54  template <typename T, typename ForwardIterator>
55  void add(T& o, ForwardIterator first, ForwardIterator last,
56           const classifier::Target& target);
57
58  ///
59  /// Calculates the probability to get \a k or smaller from a
60  /// hypergeometric distribution with parameters \a n1 \a n2 \a
61  /// t. Hypergeomtric situation you get in the following situation:
62  /// Let there be \a n1 ways for a "good" selection and \a n2 ways
63  /// for a "bad" selection out of a total of possibilities. Take \a
64  /// t samples without replacement and \a k of those are "good"
65  /// samples. \a k will follow a hypergeomtric distribution.
66  ///
67  /// @return cumulative hypergeomtric distribution functions P(k).
68  ///
69  double cdf_hypergeometric_P(unsigned int k, unsigned int n1, 
70                              unsigned int n2, unsigned int t);
71
72
73  /**
74     \brief one-sided p-value
75
76     This function uses the t-distribution to calculate the one-sided
77     p-value. Given that the true correlation is zero (Null
78     hypothesis) the estimated correlation, r, after a transformation
79     is t-distributed:
80
81     \f$ \sqrt{(n-2)} \frac{r}{\sqrt{(1-r^2)}} \in t(n-2) \f$
82
83     \return Probability that correlation is larger than \a r by
84     chance when having \a n samples. For \a r larger or equal to 1.0,
85     0.0 is returned. For \a r smaller or equal to -1.0, 1.0 is
86     returned.
87   */
88  double pearson_p_value(double r, unsigned int n);
89
90  ///
91  /// @brief Computes the kurtosis of the data in a vector.
92  ///
93  /// The kurtosis measures how sharply peaked a distribution is,
94  /// relative to its width. The kurtosis is normalized to zero for a
95  /// gaussian distribution.
96  ///
97  double kurtosis(const utility::VectorBase&);
98
99
100  ///
101  /// @brief Median absolute deviation from median
102  ///
103  /// Function is non-mutable function
104  ///
105  template <class T>
106  double mad(T first, T last, const bool sorted=false);
107
108
109  ///
110  /// Median is defined to be value in the middle. If number of values
111  /// is even median is the average of the two middle values.  the
112  /// median value is given by p equal to 50. If \a sorted is false
113  /// (default), the range is copied, the copy is sorted, and then
114  /// used to calculate the median.
115  ///
116  /// Function is a non-mutable function, i.e., \a first and \a last
117  /// can be const_iterators.
118  ///
119  /// Requirements: T should be an iterator over a range of doubles (or
120  /// any type being convertable to double).
121  ///
122  /// @return median of range
123  ///
124  template <class T> 
125  double median(T first, T last, const bool sorted=false); 
126
127
128  /**
129     The percentile is determined by the \a p, a number between 0 and
130     100. The percentile is found by interpolation, using the formula
131     \f$ percentile = (1 - \delta) x_i + \delta x_{i+1} \f$ where \a
132     p is floor\f$((n - 1)p/100)\f$ and \f$ \delta \f$ is \f$
133     (n-1)p/100 - i \f$.Thus the minimum value of the vector is given
134     by p equal to zero, the maximum is given by p equal to 100 and
135     the median value is given by p equal to 50. If @a sorted
136     is false (default), the vector is copied, the copy is sorted,
137     and then used to calculate the median.
138
139     Function is a non-mutable function, i.e., \a first and \a last
140     can be const_iterators.
141     
142     Requirements: RandomAccessIterator is an iterator over a range of
143     doubles (or any type being convertable to double).
144     
145     @return \a p'th percentile of range
146
147     \deprecated percentile2 will replace this function in the future
148
149     \note the definition of percentile used here is not identical to
150     that one used in percentile2 and Percentile. The difference is
151     smaller for large ranges.
152  */
153  template <class RandomAccessIterator>
154  double percentile(RandomAccessIterator first, RandomAccessIterator last, 
155                    double p, bool sorted=false) YAT_DEPRECATE;
156 
157
158  /**
159     \see Percentiler
160     
161     \since new in yat 0.5
162   */
163  template <class RandomAccessIterator>
164  double percentile2(RandomAccessIterator first, RandomAccessIterator last, 
165                     double p, bool sorted=false)
166  {
167    Percentiler percentiler(p, sorted);
168    return percentiler(first, last);
169  }
170
171  ///
172  /// @brief Computes the skewness of the data in a vector.
173  ///
174  /// The skewness measures the asymmetry of the tails of a
175  /// distribution.
176  ///
177  double skewness(const utility::VectorBase&);
178 
179
180  //////// template implementations ///////////
181
182  template <typename T, typename ForwardIterator>
183  void add(T& o, ForwardIterator first, ForwardIterator last,
184           const classifier::Target& target)
185  {
186    for (size_t i=0; first!=last; ++i, ++first)
187      o.add(utility::iterator_traits<ForwardIterator>().data(first),
188            target.binary(i), 
189            utility::iterator_traits<ForwardIterator>().weight(first));
190  } 
191
192
193  template <class T>
194  double mad(T first, T last, const bool sorted=false)
195  {
196    double m = median(first, last, sorted);
197    typedef std::vector<typename std::iterator_traits<T>::value_type> Vec;
198    Vec ad;
199    ad.reserve(std::distance(first, last));
200    // copy values (and weights if T is weighted)
201    std::copy(first, last, std::back_insert_iterator<Vec>(ad));
202    utility::iterator_traits<typename Vec::iterator> traits;
203    for (typename Vec::iterator i=ad.begin() ; i!=ad.end(); ++i) {
204      // adjust data
205      traits.data(i) = std::abs(traits.data(i)-m);
206    }
207    std::sort(ad.begin(), ad.end());
208    return median(ad.begin(), ad.end(), true);
209  }
210 
211
212  template <class T> 
213  double median(T first, T last, const bool sorted=false) 
214  { 
215    return percentile2(first, last, 50.0, sorted); 
216  }
217
218
219  template <class RandomAccessIterator>
220  double percentile(RandomAccessIterator first, RandomAccessIterator last, 
221                    double p, bool sorted=false)
222  {
223    // range is one value only is a special case
224    if (first+1 == last)
225      return utility::iterator_traits<RandomAccessIterator>().data(first);
226    if (sorted) {
227      // have to take care of this special case
228      if (p>=100)
229        return utility::iterator_traits<RandomAccessIterator>().data(--last);
230      double j = p/100 * (std::distance(first,last)-1);
231      int i = static_cast<int>(j);
232      return (1-j+floor(j))*first[i] + (j-floor(j))*first[i+1];
233    }
234
235    std::vector<typename std::iterator_traits<RandomAccessIterator>::value_type> 
236      v_copy;
237    v_copy.reserve(std::distance(first,last));
238    std::copy(first, last, std::back_inserter(v_copy));
239    size_t i = static_cast<size_t>(p/100 * (v_copy.size()-1));
240    if (i+2 < v_copy.size()) {
241      std::partial_sort(v_copy.begin(), v_copy.begin()+i+2, v_copy.end());
242    }
243    else
244      std::sort(v_copy.begin(), v_copy.end());
245    return percentile(v_copy.begin(), v_copy.end(), p, true);
246  }
247
248
249}}} // of namespace statistics, yat, and theplu
250
251#endif
Note: See TracBrowser for help on using the repository browser.