source: trunk/yat/statistics/utility.h @ 1500

Last change on this file since 1500 was 1500, checked in by Peter, 13 years ago

Added structure to deprecate functions. I chose to use a gcc style,
and it is tested in configure whether the compiler supports it. If not
the deprecation has no effect. Using a deprecated function will cause
a compiler warning. In gcc the warning can be turned off with
'-Wno-deprecated'. Possibly we should turn off the warning by default,
so a user has to turn it on by defining
-DYAT_DISABLE_DEPRECATED...

fixes #367

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 7.3 KB
Line 
1#ifndef _theplu_yat_statistics_utility_
2#define _theplu_yat_statistics_utility_
3
4// $Id: utility.h 1500 2008-09-15 18:47:37Z peter $
5
6/*
7  Copyright (C) 2004 Jari Häkkinen, Peter Johansson
8  Copyright (C) 2005 Peter Johansson
9  Copyright (C) 2006 Jari Häkkinen, Peter Johansson, Markus Ringnér
10  Copyright (C) 2007 Jari Häkkinen, Peter Johansson
11  Copyright (C) 2008 Peter Johansson
12
13  This file is part of the yat library, http://dev.thep.lu.se/yat
14
15  The yat library is free software; you can redistribute it and/or
16  modify it under the terms of the GNU General Public License as
17  published by the Free Software Foundation; either version 3 of the
18  License, or (at your option) any later version.
19
20  The yat library is distributed in the hope that it will be useful,
21  but WITHOUT ANY WARRANTY; without even the implied warranty of
22  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23  General Public License for more details.
24
25  You should have received a copy of the GNU General Public License
26  along with yat. If not, see <http://www.gnu.org/licenses/>.
27*/
28
29#include "Percentiler.h"
30
31#include "yat/classifier/DataLookupWeighted1D.h"
32#include "yat/classifier/Target.h"
33#include "yat/utility/deprecate.h"
34#include "yat/utility/VectorBase.h"
35#include "yat/utility/yat_assert.h"
36
37#include <algorithm>
38#include <cmath>
39#include <stdexcept>
40#include <vector>
41
42#include <gsl/gsl_statistics_double.h>
43
44namespace theplu {
45namespace yat {
46namespace statistics { 
47
48  /**
49     \brief 50th percentile
50     @see Percentiler
51  */
52  template <class T> 
53  double median(T first, T last, const bool sorted=false); 
54
55  /**
56     \see Percentiler
57  */
58  template <class T>
59  double percentile(T first, T last, double p, bool sorted=false) YAT_DEPRECATE;
60 
61  /**
62     Adding a range [\a first, \a last) into an object of type T. The
63     requirements for the type T is to have an add(double, bool, double)
64     function.
65  */
66  template <typename T, typename ForwardIterator>
67  void add(T& o, ForwardIterator first, ForwardIterator last,
68           const classifier::Target& target)
69  {
70    for (size_t i=0; first!=last; ++i, ++first)
71      o.add(utility::iterator_traits<ForwardIterator>().data(first),
72            target.binary(i), 
73            utility::iterator_traits<ForwardIterator>().weight(first));
74  } 
75
76  ///
77  /// Calculates the probability to get \a k or smaller from a
78  /// hypergeometric distribution with parameters \a n1 \a n2 \a
79  /// t. Hypergeomtric situation you get in the following situation:
80  /// Let there be \a n1 ways for a "good" selection and \a n2 ways
81  /// for a "bad" selection out of a total of possibilities. Take \a
82  /// t samples without replacement and \a k of those are "good"
83  /// samples. \a k will follow a hypergeomtric distribution.
84  ///
85  /// @return cumulative hypergeomtric distribution functions P(k).
86  ///
87  double cdf_hypergeometric_P(unsigned int k, unsigned int n1, 
88                              unsigned int n2, unsigned int t);
89
90
91  /**
92     \brief one-sided p-value
93
94     This function uses the t-distribution to calculate the one-sided
95     p-value. Given that the true correlation is zero (Null
96     hypothesis) the estimated correlation, r, after a transformation
97     is t-distributed:
98
99     \f$ \sqrt{(n-2)} \frac{r}{\sqrt{(1-r^2)}} \in t(n-2) \f$
100
101     \return Probability that correlation is larger than \a r by
102     chance when having \a n samples.
103   */
104  double pearson_p_value(double r, unsigned int n);
105
106  ///
107  /// @brief Computes the kurtosis of the data in a vector.
108  ///
109  /// The kurtosis measures how sharply peaked a distribution is,
110  /// relative to its width. The kurtosis is normalized to zero for a
111  /// gaussian distribution.
112  ///
113  double kurtosis(const utility::VectorBase&);
114
115
116  ///
117  /// @brief Median absolute deviation from median
118  ///
119  /// Function is non-mutable function
120  ///
121  template <class T>
122  double mad(T first, T last, const bool sorted=false)
123  {
124    double m = median(first, last, sorted);
125    std::vector<double> ad;
126    ad.reserve(std::distance(first, last));
127    for( ; first!=last; ++first)
128      ad.push_back(fabs(*first-m));
129    std::sort(ad.begin(), ad.end());
130    return median(ad.begin(), ad.end(), true);
131  }
132 
133
134  ///
135  /// Median is defined to be value in the middle. If number of values
136  /// is even median is the average of the two middle values.  the
137  /// median value is given by p equal to 50. If \a sorted is false
138  /// (default), the range is copied, the copy is sorted, and then
139  /// used to calculate the median.
140  ///
141  /// Function is a non-mutable function, i.e., \a first and \a last
142  /// can be const_iterators.
143  ///
144  /// Requirements: T should be an iterator over a range of doubles (or
145  /// any type being convertable to double).
146  ///
147  /// @return median of range
148  ///
149  template <class T> 
150  double median(T first, T last, const bool sorted=false) 
151  { return percentile2(first, last, 50.0, sorted); }
152
153  /**
154     The percentile is determined by the \a p, a number between 0 and
155     100. The percentile is found by interpolation, using the formula
156     \f$ percentile = (1 - \delta) x_i + \delta x_{i+1} \f$ where \a
157     p is floor\f$((n - 1)p/100)\f$ and \f$ \delta \f$ is \f$
158     (n-1)p/100 - i \f$.Thus the minimum value of the vector is given
159     by p equal to zero, the maximum is given by p equal to 100 and
160     the median value is given by p equal to 50. If @a sorted
161     is false (default), the vector is copied, the copy is sorted,
162     and then used to calculate the median.
163
164     Function is a non-mutable function, i.e., \a first and \a last
165     can be const_iterators.
166     
167     Requirements: T should be an iterator over a range of doubles (or
168     any type being convertable to double).
169     
170     @return \a p'th percentile of range
171
172     \deprecated percentile2 will replace this function in the future
173
174     \note the definition of percentile used here is not identical to
175     that one used in percentile2 and Percentile. The difference is
176     smaller for large ranges.
177  */
178  template <class RandomAccessIterator>
179  double percentile(RandomAccessIterator first, RandomAccessIterator last, 
180                    double p, bool sorted=false)
181  {
182    // range is one value only is a special case
183    if (first+1 == last)
184      return utility::iterator_traits<RandomAccessIterator>().data(first);
185    if (sorted) {
186      // have to take care of this special case
187      if (p>=100)
188        return utility::iterator_traits<RandomAccessIterator>().data(--last);
189      double j = p/100 * (std::distance(first,last)-1);
190      int i = static_cast<int>(j);
191      return (1-j+floor(j))*first[i] + (j-floor(j))*first[i+1];
192    }
193
194    std::vector<typename std::iterator_traits<RandomAccessIterator>::value_type> 
195      v_copy;
196    v_copy.reserve(std::distance(first,last));
197    std::copy(first, last, std::back_inserter(v_copy));
198    size_t i = static_cast<size_t>(p/100 * (v_copy.size()-1));
199    if (i+2 < v_copy.size()) {
200      std::partial_sort(v_copy.begin(), v_copy.begin()+i+2, v_copy.end());
201    }
202    else
203      std::sort(v_copy.begin(), v_copy.end());
204    return percentile(v_copy.begin(), v_copy.end(), p, true);
205  }
206
207
208  /**
209     \see Percentiler
210     
211     \since new in yat 0.5
212   */
213  template <class RandomAccessIterator>
214  double percentile2(RandomAccessIterator first, RandomAccessIterator last, 
215                     double p, bool sorted=false)
216  {
217    Percentiler percentiler(p, sorted);
218    return percentiler(first, last);
219  }
220
221  ///
222  /// @brief Computes the skewness of the data in a vector.
223  ///
224  /// The skewness measures the asymmetry of the tails of a
225  /// distribution.
226  ///
227  double skewness(const utility::VectorBase&);
228 
229}}} // of namespace statistics, yat, and theplu
230
231#endif
Note: See TracBrowser for help on using the repository browser.