source: trunk/yat/statistics/utility.h @ 1271

Last change on this file since 1271 was 1271, checked in by Peter, 13 years ago

replaced u_int with unsigned int or size_t

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 6.8 KB
Line 
1#ifndef _theplu_yat_statistics_utility_
2#define _theplu_yat_statistics_utility_
3
4// $Id: utility.h 1271 2008-04-09 16:11:07Z peter $
5
6/*
7  Copyright (C) 2004 Jari Häkkinen, Peter Johansson
8  Copyright (C) 2005 Peter Johansson
9  Copyright (C) 2006 Jari Häkkinen, Markus Ringnér, Peter Johansson
10  Copyright (C) 2007 Jari Häkkinen, Peter Johansson
11
12  This file is part of the yat library, http://trac.thep.lu.se/yat
13
14  The yat library is free software; you can redistribute it and/or
15  modify it under the terms of the GNU General Public License as
16  published by the Free Software Foundation; either version 2 of the
17  License, or (at your option) any later version.
18
19  The yat library is distributed in the hope that it will be useful,
20  but WITHOUT ANY WARRANTY; without even the implied warranty of
21  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22  General Public License for more details.
23
24  You should have received a copy of the GNU General Public License
25  along with this program; if not, write to the Free Software
26  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
27  02111-1307, USA.
28*/
29
30#include "yat/classifier/DataLookupWeighted1D.h"
31#include "yat/classifier/Target.h"
32#include "yat/utility/VectorBase.h"
33#include "yat/utility/yat_assert.h"
34
35#include <algorithm>
36#include <cmath>
37#include <stdexcept>
38#include <vector>
39
40#include <gsl/gsl_statistics_double.h>
41
42namespace theplu {
43namespace yat {
44namespace statistics { 
45
46  //forward declarations
47  template <class T> 
48  double median(T first, T last, const bool sorted=false); 
49
50  template <class T>
51  double percentile(T first, T last, double p, bool sorted=false);
52 
53  /**
54     Adding a range [\a first, \a last) into an object of type T. The
55     requirements for the type T is to have an add(double, bool, double)
56     function.
57  */
58  template <typename T, typename ForwardIterator>
59  void add(T& o, ForwardIterator first, ForwardIterator last,
60           const classifier::Target& target)
61  {
62    for (size_t i=0; first!=last; ++i, ++first)
63      o.add(utility::iterator_traits<ForwardIterator>().data(first),
64            target.binary(i), 
65            utility::iterator_traits<ForwardIterator>().weight(first));
66  } 
67
68  ///
69  /// Calculates the probability to get \a k or smaller from a
70  /// hypergeometric distribution with parameters \a n1 \a n2 \a
71  /// t. Hypergeomtric situation you get in the following situation:
72  /// Let there be \a n1 ways for a "good" selection and \a n2 ways
73  /// for a "bad" selection out of a total of possibilities. Take \a
74  /// t samples without replacement and \a k of those are "good"
75  /// samples. \a k will follow a hypergeomtric distribution.
76  ///
77  /// @return cumulative hypergeomtric distribution functions P(k).
78  ///
79  double cdf_hypergeometric_P(unsigned int k, unsigned int n1, 
80                              unsigned int n2, unsigned int t);
81
82
83  /**
84     \brief one-sided p-value
85
86     This function uses the t-distribution to calculate the one-sided
87     p-value. Given that the true correlation is zero (Null
88     hypothesis) the estimated correlation, r, after a transformation
89     is t-distributed:
90
91     \f$ \sqrt{(n-2)} \frac{r}{\sqrt{(1-r^2)}} \in t(n-2) \f$
92
93     \return Probability that correlation is larger than \a r by
94     chance when having \a n samples.
95   */
96  double pearson_p_value(double r, unsigned int n);
97
98  ///
99  /// @brief Computes the kurtosis of the data in a vector.
100  ///
101  /// The kurtosis measures how sharply peaked a distribution is,
102  /// relative to its width. The kurtosis is normalized to zero for a
103  /// gaussian distribution.
104  ///
105  double kurtosis(const utility::VectorBase&);
106
107
108  ///
109  /// @brief Median absolute deviation from median
110  ///
111  /// Function is non-mutable function
112  ///
113  template <class T>
114  double mad(T first, T last, const bool sorted=false)
115  {
116    double m = median(first, last, sorted);
117    std::vector<double> ad;
118    ad.reserve(std::distance(first, last));
119    for( ; first!=last; ++first)
120      ad.push_back(fabs(*first-m));
121    std::sort(ad.begin(), ad.end());
122    return median(ad.begin(), ad.end(), true);
123  }
124 
125
126  ///
127  /// Median is defined to be value in the middle. If number of values
128  /// is even median is the average of the two middle values.  the
129  /// median value is given by p equal to 50. If \a sorted is false
130  /// (default), the range is copied, the copy is sorted, and then
131  /// used to calculate the median.
132  ///
133  /// Function is a non-mutable function, i.e., \a first and \a last
134  /// can be const_iterators.
135  ///
136  /// Requirements: T should be an iterator over a range of doubles (or
137  /// any type being convertable to double).
138  ///
139  /// @return median of range
140  ///
141  template <class T> 
142  double median(T first, T last, const bool sorted=false) 
143  { return percentile(first, last, 50.0, sorted); }
144
145  /**
146     The percentile is determined by the \a p, a number between 0 and
147     100. The percentile is found by interpolation, using the formula
148     \f$ percentile = (1 - \delta) x_i + \delta x_{i+1} \f$ where \a
149     p is floor\f$((n - 1)p/100)\f$ and \f$ \delta \f$ is \f$
150     (n-1)p/100 - i \f$.Thus the minimum value of the vector is given
151     by p equal to zero, the maximum is given by p equal to 100 and
152     the median value is given by p equal to 50. If @a sorted
153     is false (default), the vector is copied, the copy is sorted,
154     and then used to calculate the median.
155
156     Function is a non-mutable function, i.e., \a first and \a last
157     can be const_iterators.
158     
159     Requirements: T should be an iterator over a range of doubles (or
160     any type being convertable to double). If \a sorted is false
161     iterator must be mutable, else read-only iterator is also ok.
162     
163     @return \a p'th percentile of range
164  */
165  template <class T>
166  double percentile(T first, T last, double p, bool sorted=false)
167  {
168    utility::yat_assert<std::range_error>(first<last);
169    utility::yat_assert<std::runtime_error>(p>=0, "percentage is negative");
170    utility::yat_assert<std::runtime_error>(p<=100, 
171                                            "percentage is larger than 100");
172    if (sorted){
173      if (p>=100)
174        return *(--last);
175      // range is one value only is a special case
176      if (std::distance(first, last)==1) 
177        return *first;
178      double j = p/100 * (std::distance(first,last)-1);
179      int i = static_cast<int>(j);
180      return (1-j+floor(j))*first[i] + (j-floor(j))*first[i+1];
181    }
182
183    std::vector<double> v_copy;
184    v_copy.reserve(std::distance(first,last));
185    std::copy(first, last, std::back_inserter(v_copy));
186    double j = p/100 * (v_copy.size()-1);
187    size_t i = static_cast<size_t>(j);
188    if (i+2 < v_copy.size()) {
189      utility::yat_assert<std::out_of_range>(i+2 < v_copy.size(), 
190                                             "in utility::percentile");
191      std::partial_sort(v_copy.begin(), v_copy.begin()+i+2, v_copy.end());
192    }
193    else
194      std::sort(v_copy.begin(), v_copy.end());
195     
196    return percentile(v_copy.begin(), v_copy.end(), p, true);
197  }
198
199  ///
200  /// @brief Computes the skewness of the data in a vector.
201  ///
202  /// The skewness measures the asymmetry of the tails of a
203  /// distribution.
204  ///
205  double skewness(const utility::VectorBase&);
206 
207}}} // of namespace statistics, yat, and theplu
208
209#endif
Note: See TracBrowser for help on using the repository browser.