source: trunk/yat/statistics/utility.h @ 1701

Last change on this file since 1701 was 1553, checked in by Peter, 13 years ago

fixes #450

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 7.4 KB
Line 
1#ifndef _theplu_yat_statistics_utility_
2#define _theplu_yat_statistics_utility_
3
4// $Id: utility.h 1553 2008-10-07 16:06:50Z peter $
5
6/*
7  Copyright (C) 2004 Jari Häkkinen, Peter Johansson
8  Copyright (C) 2005 Peter Johansson
9  Copyright (C) 2006 Jari Häkkinen, Peter Johansson, Markus Ringnér
10  Copyright (C) 2007 Jari Häkkinen, Peter Johansson
11  Copyright (C) 2008 Peter Johansson
12
13  This file is part of the yat library, http://dev.thep.lu.se/yat
14
15  The yat library is free software; you can redistribute it and/or
16  modify it under the terms of the GNU General Public License as
17  published by the Free Software Foundation; either version 3 of the
18  License, or (at your option) any later version.
19
20  The yat library is distributed in the hope that it will be useful,
21  but WITHOUT ANY WARRANTY; without even the implied warranty of
22  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23  General Public License for more details.
24
25  You should have received a copy of the GNU General Public License
26  along with yat. If not, see <http://www.gnu.org/licenses/>.
27*/
28
29#include "Percentiler.h"
30
31#include "yat/classifier/DataLookupWeighted1D.h"
32#include "yat/classifier/Target.h"
33#include "yat/utility/deprecate.h"
34#include "yat/utility/VectorBase.h"
35#include "yat/utility/yat_assert.h"
36
37#include <algorithm>
38#include <cmath>
39#include <stdexcept>
40#include <vector>
41
42#include <gsl/gsl_statistics_double.h>
43
44namespace theplu {
45namespace yat {
46namespace statistics { 
47
48  /**
49     \brief 50th percentile
50     @see Percentiler
51  */
52  template <class T> 
53  double median(T first, T last, const bool sorted=false); 
54
55  /**
56     \see Percentiler
57  */
58  template <class T>
59  double percentile(T first, T last, double p, bool sorted=false) YAT_DEPRECATE;
60 
61  /**
62     Adding a range [\a first, \a last) into an object of type T. The
63     requirements for the type T is to have an add(double, bool, double)
64     function.
65  */
66  template <typename T, typename ForwardIterator>
67  void add(T& o, ForwardIterator first, ForwardIterator last,
68           const classifier::Target& target)
69  {
70    for (size_t i=0; first!=last; ++i, ++first)
71      o.add(utility::iterator_traits<ForwardIterator>().data(first),
72            target.binary(i), 
73            utility::iterator_traits<ForwardIterator>().weight(first));
74  } 
75
76  ///
77  /// Calculates the probability to get \a k or smaller from a
78  /// hypergeometric distribution with parameters \a n1 \a n2 \a
79  /// t. Hypergeomtric situation you get in the following situation:
80  /// Let there be \a n1 ways for a "good" selection and \a n2 ways
81  /// for a "bad" selection out of a total of possibilities. Take \a
82  /// t samples without replacement and \a k of those are "good"
83  /// samples. \a k will follow a hypergeomtric distribution.
84  ///
85  /// @return cumulative hypergeomtric distribution functions P(k).
86  ///
87  double cdf_hypergeometric_P(unsigned int k, unsigned int n1, 
88                              unsigned int n2, unsigned int t);
89
90
91  /**
92     \brief one-sided p-value
93
94     This function uses the t-distribution to calculate the one-sided
95     p-value. Given that the true correlation is zero (Null
96     hypothesis) the estimated correlation, r, after a transformation
97     is t-distributed:
98
99     \f$ \sqrt{(n-2)} \frac{r}{\sqrt{(1-r^2)}} \in t(n-2) \f$
100
101     \return Probability that correlation is larger than \a r by
102     chance when having \a n samples. For \a r larger or equal to 1.0,
103     0.0 is returned. For \a r smaller or equal to -1.0, 1.0 is
104     returned.
105   */
106  double pearson_p_value(double r, unsigned int n);
107
108  ///
109  /// @brief Computes the kurtosis of the data in a vector.
110  ///
111  /// The kurtosis measures how sharply peaked a distribution is,
112  /// relative to its width. The kurtosis is normalized to zero for a
113  /// gaussian distribution.
114  ///
115  double kurtosis(const utility::VectorBase&);
116
117
118  ///
119  /// @brief Median absolute deviation from median
120  ///
121  /// Function is non-mutable function
122  ///
123  template <class T>
124  double mad(T first, T last, const bool sorted=false)
125  {
126    double m = median(first, last, sorted);
127    std::vector<double> ad;
128    ad.reserve(std::distance(first, last));
129    for( ; first!=last; ++first)
130      ad.push_back(fabs(*first-m));
131    std::sort(ad.begin(), ad.end());
132    return median(ad.begin(), ad.end(), true);
133  }
134 
135
136  ///
137  /// Median is defined to be value in the middle. If number of values
138  /// is even median is the average of the two middle values.  the
139  /// median value is given by p equal to 50. If \a sorted is false
140  /// (default), the range is copied, the copy is sorted, and then
141  /// used to calculate the median.
142  ///
143  /// Function is a non-mutable function, i.e., \a first and \a last
144  /// can be const_iterators.
145  ///
146  /// Requirements: T should be an iterator over a range of doubles (or
147  /// any type being convertable to double).
148  ///
149  /// @return median of range
150  ///
151  template <class T> 
152  double median(T first, T last, const bool sorted=false) 
153  { return percentile2(first, last, 50.0, sorted); }
154
155  /**
156     The percentile is determined by the \a p, a number between 0 and
157     100. The percentile is found by interpolation, using the formula
158     \f$ percentile = (1 - \delta) x_i + \delta x_{i+1} \f$ where \a
159     p is floor\f$((n - 1)p/100)\f$ and \f$ \delta \f$ is \f$
160     (n-1)p/100 - i \f$.Thus the minimum value of the vector is given
161     by p equal to zero, the maximum is given by p equal to 100 and
162     the median value is given by p equal to 50. If @a sorted
163     is false (default), the vector is copied, the copy is sorted,
164     and then used to calculate the median.
165
166     Function is a non-mutable function, i.e., \a first and \a last
167     can be const_iterators.
168     
169     Requirements: T should be an iterator over a range of doubles (or
170     any type being convertable to double).
171     
172     @return \a p'th percentile of range
173
174     \deprecated percentile2 will replace this function in the future
175
176     \note the definition of percentile used here is not identical to
177     that one used in percentile2 and Percentile. The difference is
178     smaller for large ranges.
179  */
180  template <class RandomAccessIterator>
181  double percentile(RandomAccessIterator first, RandomAccessIterator last, 
182                    double p, bool sorted=false)
183  {
184    // range is one value only is a special case
185    if (first+1 == last)
186      return utility::iterator_traits<RandomAccessIterator>().data(first);
187    if (sorted) {
188      // have to take care of this special case
189      if (p>=100)
190        return utility::iterator_traits<RandomAccessIterator>().data(--last);
191      double j = p/100 * (std::distance(first,last)-1);
192      int i = static_cast<int>(j);
193      return (1-j+floor(j))*first[i] + (j-floor(j))*first[i+1];
194    }
195
196    std::vector<typename std::iterator_traits<RandomAccessIterator>::value_type> 
197      v_copy;
198    v_copy.reserve(std::distance(first,last));
199    std::copy(first, last, std::back_inserter(v_copy));
200    size_t i = static_cast<size_t>(p/100 * (v_copy.size()-1));
201    if (i+2 < v_copy.size()) {
202      std::partial_sort(v_copy.begin(), v_copy.begin()+i+2, v_copy.end());
203    }
204    else
205      std::sort(v_copy.begin(), v_copy.end());
206    return percentile(v_copy.begin(), v_copy.end(), p, true);
207  }
208
209
210  /**
211     \see Percentiler
212     
213     \since new in yat 0.5
214   */
215  template <class RandomAccessIterator>
216  double percentile2(RandomAccessIterator first, RandomAccessIterator last, 
217                     double p, bool sorted=false)
218  {
219    Percentiler percentiler(p, sorted);
220    return percentiler(first, last);
221  }
222
223  ///
224  /// @brief Computes the skewness of the data in a vector.
225  ///
226  /// The skewness measures the asymmetry of the tails of a
227  /// distribution.
228  ///
229  double skewness(const utility::VectorBase&);
230 
231}}} // of namespace statistics, yat, and theplu
232
233#endif
Note: See TracBrowser for help on using the repository browser.