source: trunk/yat/statistics/utility.h @ 1437

Last change on this file since 1437 was 1437, checked in by Peter, 13 years ago

merge patch release 0.4.2 to trunk. Delta 0.4.2-0.4.1

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 7.4 KB
Line 
1#ifndef _theplu_yat_statistics_utility_
2#define _theplu_yat_statistics_utility_
3
4// $Id: utility.h 1437 2008-08-25 17:55:00Z peter $
5
6/*
7  Copyright (C) 2004 Jari Häkkinen, Peter Johansson
8  Copyright (C) 2005 Peter Johansson
9  Copyright (C) 2006 Jari Häkkinen, Peter Johansson, Markus Ringnér
10  Copyright (C) 2007 Jari Häkkinen, Peter Johansson
11  Copyright (C) 2008 Peter Johansson
12
13  This file is part of the yat library, http://dev.thep.lu.se/yat
14
15  The yat library is free software; you can redistribute it and/or
16  modify it under the terms of the GNU General Public License as
17  published by the Free Software Foundation; either version 2 of the
18  License, or (at your option) any later version.
19
20  The yat library is distributed in the hope that it will be useful,
21  but WITHOUT ANY WARRANTY; without even the implied warranty of
22  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23  General Public License for more details.
24
25  You should have received a copy of the GNU General Public License
26  along with this program; if not, write to the Free Software
27  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
28  02111-1307, USA.
29*/
30
31#include "Percentiler.h"
32
33#include "yat/classifier/DataLookupWeighted1D.h"
34#include "yat/classifier/Target.h"
35#include "yat/utility/VectorBase.h"
36#include "yat/utility/yat_assert.h"
37
38#include <algorithm>
39#include <cmath>
40#include <stdexcept>
41#include <vector>
42
43#include <gsl/gsl_statistics_double.h>
44
45namespace theplu {
46namespace yat {
47namespace statistics { 
48
49  /**
50     \brief 50th percentile
51     @see Percentiler
52  */
53  template <class T> 
54  double median(T first, T last, const bool sorted=false); 
55
56  /**
57     \see Percentiler
58  */
59  template <class T>
60  double percentile(T first, T last, double p, bool sorted=false);
61 
62  /**
63     Adding a range [\a first, \a last) into an object of type T. The
64     requirements for the type T is to have an add(double, bool, double)
65     function.
66  */
67  template <typename T, typename ForwardIterator>
68  void add(T& o, ForwardIterator first, ForwardIterator last,
69           const classifier::Target& target)
70  {
71    for (size_t i=0; first!=last; ++i, ++first)
72      o.add(utility::iterator_traits<ForwardIterator>().data(first),
73            target.binary(i), 
74            utility::iterator_traits<ForwardIterator>().weight(first));
75  } 
76
77  ///
78  /// Calculates the probability to get \a k or smaller from a
79  /// hypergeometric distribution with parameters \a n1 \a n2 \a
80  /// t. Hypergeomtric situation you get in the following situation:
81  /// Let there be \a n1 ways for a "good" selection and \a n2 ways
82  /// for a "bad" selection out of a total of possibilities. Take \a
83  /// t samples without replacement and \a k of those are "good"
84  /// samples. \a k will follow a hypergeomtric distribution.
85  ///
86  /// @return cumulative hypergeomtric distribution functions P(k).
87  ///
88  /// \deprecated Provided for backward compatibility with the 0.4
89  /// API. Use gsl_cdf_hypergeometric_P
90  ///
91  double cdf_hypergeometric_P(unsigned int k, unsigned int n1, 
92                              unsigned int n2, unsigned int t);
93
94
95  /**
96     \brief one-sided p-value
97
98     This function uses the t-distribution to calculate the one-sided
99     p-value. Given that the true correlation is zero (Null
100     hypothesis) the estimated correlation, r, after a transformation
101     is t-distributed:
102
103     \f$ \sqrt{(n-2)} \frac{r}{\sqrt{(1-r^2)}} \in t(n-2) \f$
104
105     \return Probability that correlation is larger than \a r by
106     chance when having \a n samples.
107   */
108  double pearson_p_value(double r, unsigned int n);
109
110  ///
111  /// @brief Computes the kurtosis of the data in a vector.
112  ///
113  /// The kurtosis measures how sharply peaked a distribution is,
114  /// relative to its width. The kurtosis is normalized to zero for a
115  /// gaussian distribution.
116  ///
117  double kurtosis(const utility::VectorBase&);
118
119
120  ///
121  /// @brief Median absolute deviation from median
122  ///
123  /// Function is non-mutable function
124  ///
125  template <class T>
126  double mad(T first, T last, const bool sorted=false)
127  {
128    double m = median(first, last, sorted);
129    std::vector<double> ad;
130    ad.reserve(std::distance(first, last));
131    for( ; first!=last; ++first)
132      ad.push_back(fabs(*first-m));
133    std::sort(ad.begin(), ad.end());
134    return median(ad.begin(), ad.end(), true);
135  }
136 
137
138  ///
139  /// Median is defined to be value in the middle. If number of values
140  /// is even median is the average of the two middle values.  the
141  /// median value is given by p equal to 50. If \a sorted is false
142  /// (default), the range is copied, the copy is sorted, and then
143  /// used to calculate the median.
144  ///
145  /// Function is a non-mutable function, i.e., \a first and \a last
146  /// can be const_iterators.
147  ///
148  /// Requirements: T should be an iterator over a range of doubles (or
149  /// any type being convertable to double).
150  ///
151  /// @return median of range
152  ///
153  template <class T> 
154  double median(T first, T last, const bool sorted=false) 
155  { return percentile2(first, last, 50.0, sorted); }
156
157  /**
158     The percentile is determined by the \a p, a number between 0 and
159     100. The percentile is found by interpolation, using the formula
160     \f$ percentile = (1 - \delta) x_i + \delta x_{i+1} \f$ where \a
161     p is floor\f$((n - 1)p/100)\f$ and \f$ \delta \f$ is \f$
162     (n-1)p/100 - i \f$.Thus the minimum value of the vector is given
163     by p equal to zero, the maximum is given by p equal to 100 and
164     the median value is given by p equal to 50. If @a sorted
165     is false (default), the vector is copied, the copy is sorted,
166     and then used to calculate the median.
167
168     Function is a non-mutable function, i.e., \a first and \a last
169     can be const_iterators.
170     
171     Requirements: T should be an iterator over a range of doubles (or
172     any type being convertable to double).
173     
174     @return \a p'th percentile of range
175
176     \deprecated percentile2 will replace this function in the future
177
178     \note the definition of percentile used here is not identical to
179     that one used in percentile2 and Percentile. The difference is
180     smaller for large ranges.
181  */
182  template <class RandomAccessIterator>
183  double percentile(RandomAccessIterator first, RandomAccessIterator last, 
184                    double p, bool sorted=false)
185  {
186    // range is one value only is a special case
187    if (first+1 == last)
188      return utility::iterator_traits<RandomAccessIterator>().data(first);
189    if (sorted) {
190      // have to take care of this special case
191      if (p>=100)
192        return utility::iterator_traits<RandomAccessIterator>().data(--last);
193      double j = p/100 * (std::distance(first,last)-1);
194      int i = static_cast<int>(j);
195      return (1-j+floor(j))*first[i] + (j-floor(j))*first[i+1];
196    }
197
198    std::vector<typename std::iterator_traits<RandomAccessIterator>::value_type> 
199      v_copy;
200    v_copy.reserve(std::distance(first,last));
201    std::copy(first, last, std::back_inserter(v_copy));
202    size_t i = static_cast<size_t>(p/100 * (v_copy.size()-1));
203    if (i+2 < v_copy.size()) {
204      std::partial_sort(v_copy.begin(), v_copy.begin()+i+2, v_copy.end());
205    }
206    else
207      std::sort(v_copy.begin(), v_copy.end());
208    return percentile(v_copy.begin(), v_copy.end(), p, true);
209  }
210
211
212  /**
213     \see Percentiler
214     
215     \since new in yat 0.5
216   */
217  template <class RandomAccessIterator>
218  double percentile2(RandomAccessIterator first, RandomAccessIterator last, 
219                     double p, bool sorted=false)
220  {
221    Percentiler percentiler(p, sorted);
222    return percentiler(first, last);
223  }
224
225  ///
226  /// @brief Computes the skewness of the data in a vector.
227  ///
228  /// The skewness measures the asymmetry of the tails of a
229  /// distribution.
230  ///
231  double skewness(const utility::VectorBase&);
232 
233}}} // of namespace statistics, yat, and theplu
234
235#endif
Note: See TracBrowser for help on using the repository browser.