source: trunk/yat/statistics/utility.h @ 1835

Last change on this file since 1835 was 1835, checked in by Peter, 13 years ago

fixes #399 - allow weighted iterators in mad

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 7.7 KB
Line 
1#ifndef _theplu_yat_statistics_utility_
2#define _theplu_yat_statistics_utility_
3
4// $Id: utility.h 1835 2009-02-25 23:41:15Z peter $
5
6/*
7  Copyright (C) 2004 Jari Häkkinen, Peter Johansson
8  Copyright (C) 2005 Peter Johansson
9  Copyright (C) 2006 Jari Häkkinen, Peter Johansson, Markus Ringnér
10  Copyright (C) 2007, 2008 Jari Häkkinen, Peter Johansson
11  Copyright (C) 2009 Peter Johansson
12
13  This file is part of the yat library, http://dev.thep.lu.se/yat
14
15  The yat library is free software; you can redistribute it and/or
16  modify it under the terms of the GNU General Public License as
17  published by the Free Software Foundation; either version 3 of the
18  License, or (at your option) any later version.
19
20  The yat library is distributed in the hope that it will be useful,
21  but WITHOUT ANY WARRANTY; without even the implied warranty of
22  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23  General Public License for more details.
24
25  You should have received a copy of the GNU General Public License
26  along with yat. If not, see <http://www.gnu.org/licenses/>.
27*/
28
29#include "Percentiler.h"
30
31#include "yat/classifier/DataLookupWeighted1D.h"
32#include "yat/classifier/Target.h"
33#include "yat/utility/deprecate.h"
34#include "yat/utility/iterator_traits.h"
35#include "yat/utility/VectorBase.h"
36#include "yat/utility/yat_assert.h"
37
38#include <algorithm>
39#include <cmath>
40#include <stdexcept>
41#include <vector>
42
43#include <gsl/gsl_statistics_double.h>
44
45namespace theplu {
46namespace yat {
47namespace statistics { 
48
49  /**
50     \brief 50th percentile
51     @see Percentiler
52  */
53  template <class T> 
54  double median(T first, T last, const bool sorted=false); 
55
56  /**
57     \see Percentiler
58  */
59  template <class T>
60  double percentile(T first, T last, double p, bool sorted=false) YAT_DEPRECATE;
61 
62  /**
63     Adding a range [\a first, \a last) into an object of type T. The
64     requirements for the type T is to have an add(double, bool, double)
65     function.
66  */
67  template <typename T, typename ForwardIterator>
68  void add(T& o, ForwardIterator first, ForwardIterator last,
69           const classifier::Target& target)
70  {
71    for (size_t i=0; first!=last; ++i, ++first)
72      o.add(utility::iterator_traits<ForwardIterator>().data(first),
73            target.binary(i), 
74            utility::iterator_traits<ForwardIterator>().weight(first));
75  } 
76
77  ///
78  /// Calculates the probability to get \a k or smaller from a
79  /// hypergeometric distribution with parameters \a n1 \a n2 \a
80  /// t. Hypergeomtric situation you get in the following situation:
81  /// Let there be \a n1 ways for a "good" selection and \a n2 ways
82  /// for a "bad" selection out of a total of possibilities. Take \a
83  /// t samples without replacement and \a k of those are "good"
84  /// samples. \a k will follow a hypergeomtric distribution.
85  ///
86  /// @return cumulative hypergeomtric distribution functions P(k).
87  ///
88  double cdf_hypergeometric_P(unsigned int k, unsigned int n1, 
89                              unsigned int n2, unsigned int t);
90
91
92  /**
93     \brief one-sided p-value
94
95     This function uses the t-distribution to calculate the one-sided
96     p-value. Given that the true correlation is zero (Null
97     hypothesis) the estimated correlation, r, after a transformation
98     is t-distributed:
99
100     \f$ \sqrt{(n-2)} \frac{r}{\sqrt{(1-r^2)}} \in t(n-2) \f$
101
102     \return Probability that correlation is larger than \a r by
103     chance when having \a n samples. For \a r larger or equal to 1.0,
104     0.0 is returned. For \a r smaller or equal to -1.0, 1.0 is
105     returned.
106   */
107  double pearson_p_value(double r, unsigned int n);
108
109  ///
110  /// @brief Computes the kurtosis of the data in a vector.
111  ///
112  /// The kurtosis measures how sharply peaked a distribution is,
113  /// relative to its width. The kurtosis is normalized to zero for a
114  /// gaussian distribution.
115  ///
116  double kurtosis(const utility::VectorBase&);
117
118
119  ///
120  /// @brief Median absolute deviation from median
121  ///
122  /// Function is non-mutable function
123  ///
124  template <class T>
125  double mad(T first, T last, const bool sorted=false)
126  {
127    double m = median(first, last, sorted);
128
129    typedef std::vector<typename std::iterator_traits<T>::value_type> Vec;
130    Vec ad;
131    ad.reserve(std::distance(first, last));
132    // copy values (and weights if T is weighted)
133    std::copy(first, last, std::back_insert_iterator<Vec>(ad));
134    utility::iterator_traits<typename Vec::iterator> traits;
135    for (typename Vec::iterator i=ad.begin() ; i!=ad.end(); ++i) {
136      // adjust data
137      traits.data(i) = std::abs(traits.data(i)-m);
138    }
139    std::sort(ad.begin(), ad.end());
140    return median(ad.begin(), ad.end(), true);
141  }
142 
143
144  ///
145  /// Median is defined to be value in the middle. If number of values
146  /// is even median is the average of the two middle values.  the
147  /// median value is given by p equal to 50. If \a sorted is false
148  /// (default), the range is copied, the copy is sorted, and then
149  /// used to calculate the median.
150  ///
151  /// Function is a non-mutable function, i.e., \a first and \a last
152  /// can be const_iterators.
153  ///
154  /// Requirements: T should be an iterator over a range of doubles (or
155  /// any type being convertable to double).
156  ///
157  /// @return median of range
158  ///
159  template <class T> 
160  double median(T first, T last, const bool sorted=false) 
161  { return percentile2(first, last, 50.0, sorted); }
162
163  /**
164     The percentile is determined by the \a p, a number between 0 and
165     100. The percentile is found by interpolation, using the formula
166     \f$ percentile = (1 - \delta) x_i + \delta x_{i+1} \f$ where \a
167     p is floor\f$((n - 1)p/100)\f$ and \f$ \delta \f$ is \f$
168     (n-1)p/100 - i \f$.Thus the minimum value of the vector is given
169     by p equal to zero, the maximum is given by p equal to 100 and
170     the median value is given by p equal to 50. If @a sorted
171     is false (default), the vector is copied, the copy is sorted,
172     and then used to calculate the median.
173
174     Function is a non-mutable function, i.e., \a first and \a last
175     can be const_iterators.
176     
177     Requirements: T should be an iterator over a range of doubles (or
178     any type being convertable to double).
179     
180     @return \a p'th percentile of range
181
182     \deprecated percentile2 will replace this function in the future
183
184     \note the definition of percentile used here is not identical to
185     that one used in percentile2 and Percentile. The difference is
186     smaller for large ranges.
187  */
188  template <class RandomAccessIterator>
189  double percentile(RandomAccessIterator first, RandomAccessIterator last, 
190                    double p, bool sorted=false)
191  {
192    // range is one value only is a special case
193    if (first+1 == last)
194      return utility::iterator_traits<RandomAccessIterator>().data(first);
195    if (sorted) {
196      // have to take care of this special case
197      if (p>=100)
198        return utility::iterator_traits<RandomAccessIterator>().data(--last);
199      double j = p/100 * (std::distance(first,last)-1);
200      int i = static_cast<int>(j);
201      return (1-j+floor(j))*first[i] + (j-floor(j))*first[i+1];
202    }
203
204    std::vector<typename std::iterator_traits<RandomAccessIterator>::value_type> 
205      v_copy;
206    v_copy.reserve(std::distance(first,last));
207    std::copy(first, last, std::back_inserter(v_copy));
208    size_t i = static_cast<size_t>(p/100 * (v_copy.size()-1));
209    if (i+2 < v_copy.size()) {
210      std::partial_sort(v_copy.begin(), v_copy.begin()+i+2, v_copy.end());
211    }
212    else
213      std::sort(v_copy.begin(), v_copy.end());
214    return percentile(v_copy.begin(), v_copy.end(), p, true);
215  }
216
217
218  /**
219     \see Percentiler
220     
221     \since new in yat 0.5
222   */
223  template <class RandomAccessIterator>
224  double percentile2(RandomAccessIterator first, RandomAccessIterator last, 
225                     double p, bool sorted=false)
226  {
227    Percentiler percentiler(p, sorted);
228    return percentiler(first, last);
229  }
230
231  ///
232  /// @brief Computes the skewness of the data in a vector.
233  ///
234  /// The skewness measures the asymmetry of the tails of a
235  /// distribution.
236  ///
237  double skewness(const utility::VectorBase&);
238 
239}}} // of namespace statistics, yat, and theplu
240
241#endif
Note: See TracBrowser for help on using the repository browser.