source: trunk/yat/statistics/utility.h @ 1305

Last change on this file since 1305 was 1305, checked in by Peter, 13 years ago

Adding a functor that calculate average of a range. Also added a metafunction that return Averager or AveragerWeighted? depending on context.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 7.4 KB
Line 
1#ifndef _theplu_yat_statistics_utility_
2#define _theplu_yat_statistics_utility_
3
4// $Id: utility.h 1305 2008-05-14 22:25:54Z peter $
5
6/*
7  Copyright (C) 2004 Jari Häkkinen, Peter Johansson
8  Copyright (C) 2005 Peter Johansson
9  Copyright (C) 2006 Jari Häkkinen, Peter Johansson, Markus Ringnér
10  Copyright (C) 2007 Jari Häkkinen, Peter Johansson
11  Copyright (C) 2008 Peter Johansson
12
13  This file is part of the yat library, http://trac.thep.lu.se/yat
14
15  The yat library is free software; you can redistribute it and/or
16  modify it under the terms of the GNU General Public License as
17  published by the Free Software Foundation; either version 2 of the
18  License, or (at your option) any later version.
19
20  The yat library is distributed in the hope that it will be useful,
21  but WITHOUT ANY WARRANTY; without even the implied warranty of
22  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23  General Public License for more details.
24
25  You should have received a copy of the GNU General Public License
26  along with this program; if not, write to the Free Software
27  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
28  02111-1307, USA.
29*/
30
31#include "averager_traits.h"
32#include "yat/classifier/DataLookupWeighted1D.h"
33#include "yat/classifier/Target.h"
34#include "yat/utility/VectorBase.h"
35#include "yat/utility/yat_assert.h"
36
37#include <algorithm>
38#include <cmath>
39#include <stdexcept>
40#include <vector>
41
42#include <gsl/gsl_statistics_double.h>
43
44namespace theplu {
45namespace yat {
46namespace statistics { 
47
48  /**
49     \brief Functor to take average of a range.
50   */
51  struct Average
52  {
53    /**
54       If range is weighted an AveragerWeighted is used else a Averager.
55       
56       \return average of range
57     */
58    template<typename ForwardIterator>
59    inline double operator()(ForwardIterator first, ForwardIterator last) const
60    {
61      typename averager<ForwardIterator>::type a;
62      add(a, first, last);
63      return a.mean();
64    }
65
66  };
67
68
69
70  //forward declarations
71  template <class T> 
72  double median(T first, T last, const bool sorted=false); 
73
74  template <class T>
75  double percentile(T first, T last, double p, bool sorted=false);
76 
77  /**
78     Adding a range [\a first, \a last) into an object of type T. The
79     requirements for the type T is to have an add(double, bool, double)
80     function.
81  */
82  template <typename T, typename ForwardIterator>
83  void add(T& o, ForwardIterator first, ForwardIterator last,
84           const classifier::Target& target)
85  {
86    for (size_t i=0; first!=last; ++i, ++first)
87      o.add(utility::iterator_traits<ForwardIterator>().data(first),
88            target.binary(i), 
89            utility::iterator_traits<ForwardIterator>().weight(first));
90  } 
91
92  ///
93  /// Calculates the probability to get \a k or smaller from a
94  /// hypergeometric distribution with parameters \a n1 \a n2 \a
95  /// t. Hypergeomtric situation you get in the following situation:
96  /// Let there be \a n1 ways for a "good" selection and \a n2 ways
97  /// for a "bad" selection out of a total of possibilities. Take \a
98  /// t samples without replacement and \a k of those are "good"
99  /// samples. \a k will follow a hypergeomtric distribution.
100  ///
101  /// @return cumulative hypergeomtric distribution functions P(k).
102  ///
103  /// \deprecated
104  ///
105  double cdf_hypergeometric_P(unsigned int k, unsigned int n1, 
106                              unsigned int n2, unsigned int t);
107
108
109  /**
110     \brief one-sided p-value
111
112     This function uses the t-distribution to calculate the one-sided
113     p-value. Given that the true correlation is zero (Null
114     hypothesis) the estimated correlation, r, after a transformation
115     is t-distributed:
116
117     \f$ \sqrt{(n-2)} \frac{r}{\sqrt{(1-r^2)}} \in t(n-2) \f$
118
119     \return Probability that correlation is larger than \a r by
120     chance when having \a n samples.
121   */
122  double pearson_p_value(double r, unsigned int n);
123
124  ///
125  /// @brief Computes the kurtosis of the data in a vector.
126  ///
127  /// The kurtosis measures how sharply peaked a distribution is,
128  /// relative to its width. The kurtosis is normalized to zero for a
129  /// gaussian distribution.
130  ///
131  double kurtosis(const utility::VectorBase&);
132
133
134  ///
135  /// @brief Median absolute deviation from median
136  ///
137  /// Function is non-mutable function
138  ///
139  template <class T>
140  double mad(T first, T last, const bool sorted=false)
141  {
142    double m = median(first, last, sorted);
143    std::vector<double> ad;
144    ad.reserve(std::distance(first, last));
145    for( ; first!=last; ++first)
146      ad.push_back(fabs(*first-m));
147    std::sort(ad.begin(), ad.end());
148    return median(ad.begin(), ad.end(), true);
149  }
150 
151
152  ///
153  /// Median is defined to be value in the middle. If number of values
154  /// is even median is the average of the two middle values.  the
155  /// median value is given by p equal to 50. If \a sorted is false
156  /// (default), the range is copied, the copy is sorted, and then
157  /// used to calculate the median.
158  ///
159  /// Function is a non-mutable function, i.e., \a first and \a last
160  /// can be const_iterators.
161  ///
162  /// Requirements: T should be an iterator over a range of doubles (or
163  /// any type being convertable to double).
164  ///
165  /// @return median of range
166  ///
167  template <class T> 
168  double median(T first, T last, const bool sorted=false) 
169  { return percentile(first, last, 50.0, sorted); }
170
171  /**
172     The percentile is determined by the \a p, a number between 0 and
173     100. The percentile is found by interpolation, using the formula
174     \f$ percentile = (1 - \delta) x_i + \delta x_{i+1} \f$ where \a
175     p is floor\f$((n - 1)p/100)\f$ and \f$ \delta \f$ is \f$
176     (n-1)p/100 - i \f$.Thus the minimum value of the vector is given
177     by p equal to zero, the maximum is given by p equal to 100 and
178     the median value is given by p equal to 50. If @a sorted
179     is false (default), the vector is copied, the copy is sorted,
180     and then used to calculate the median.
181
182     Function is a non-mutable function, i.e., \a first and \a last
183     can be const_iterators.
184     
185     Requirements: T should be an iterator over a range of doubles (or
186     any type being convertable to double). If \a sorted is false
187     iterator must be mutable, else read-only iterator is also ok.
188     
189     @return \a p'th percentile of range
190  */
191  template <class T>
192  double percentile(T first, T last, double p, bool sorted=false)
193  {
194    utility::yat_assert<std::range_error>(first<last,
195                                          "percentile: invalid range");
196    utility::yat_assert<std::runtime_error>(p>=0, "percentage is negative");
197    utility::yat_assert<std::runtime_error>(p<=100, 
198                                            "percentage is larger than 100");
199    if (sorted){
200      if (p>=100)
201        return *(--last);
202      // range is one value only is a special case
203      if (std::distance(first, last)==1) 
204        return *first;
205      double j = p/100 * (std::distance(first,last)-1);
206      int i = static_cast<int>(j);
207      return (1-j+floor(j))*first[i] + (j-floor(j))*first[i+1];
208    }
209
210    std::vector<double> v_copy;
211    v_copy.reserve(std::distance(first,last));
212    std::copy(first, last, std::back_inserter(v_copy));
213    double j = p/100 * (v_copy.size()-1);
214    size_t i = static_cast<size_t>(j);
215    if (i+2 < v_copy.size()) {
216      utility::yat_assert<std::out_of_range>(i+2 < v_copy.size(), 
217                                             "in utility::percentile");
218      std::partial_sort(v_copy.begin(), v_copy.begin()+i+2, v_copy.end());
219    }
220    else
221      std::sort(v_copy.begin(), v_copy.end());
222     
223    return percentile(v_copy.begin(), v_copy.end(), p, true);
224  }
225
226  ///
227  /// @brief Computes the skewness of the data in a vector.
228  ///
229  /// The skewness measures the asymmetry of the tails of a
230  /// distribution.
231  ///
232  double skewness(const utility::VectorBase&);
233 
234}}} // of namespace statistics, yat, and theplu
235
236#endif
Note: See TracBrowser for help on using the repository browser.