source: trunk/yat/statistics/utility.h @ 1275

Last change on this file since 1275 was 1275, checked in by Jari Häkkinen, 13 years ago

Updating copyright statements.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 6.9 KB
Line 
1#ifndef _theplu_yat_statistics_utility_
2#define _theplu_yat_statistics_utility_
3
4// $Id: utility.h 1275 2008-04-11 06:10:12Z jari $
5
6/*
7  Copyright (C) 2004 Jari Häkkinen, Peter Johansson
8  Copyright (C) 2005 Peter Johansson
9  Copyright (C) 2006 Jari Häkkinen, Peter Johansson, Markus Ringnér
10  Copyright (C) 2007 Jari Häkkinen, Peter Johansson
11  Copyright (C) 2008 Peter Johansson
12
13  This file is part of the yat library, http://trac.thep.lu.se/yat
14
15  The yat library is free software; you can redistribute it and/or
16  modify it under the terms of the GNU General Public License as
17  published by the Free Software Foundation; either version 2 of the
18  License, or (at your option) any later version.
19
20  The yat library is distributed in the hope that it will be useful,
21  but WITHOUT ANY WARRANTY; without even the implied warranty of
22  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23  General Public License for more details.
24
25  You should have received a copy of the GNU General Public License
26  along with this program; if not, write to the Free Software
27  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
28  02111-1307, USA.
29*/
30
31#include "yat/classifier/DataLookupWeighted1D.h"
32#include "yat/classifier/Target.h"
33#include "yat/utility/VectorBase.h"
34#include "yat/utility/yat_assert.h"
35
36#include <algorithm>
37#include <cmath>
38#include <stdexcept>
39#include <vector>
40
41#include <gsl/gsl_statistics_double.h>
42
43namespace theplu {
44namespace yat {
45namespace statistics { 
46
47  //forward declarations
48  template <class T> 
49  double median(T first, T last, const bool sorted=false); 
50
51  template <class T>
52  double percentile(T first, T last, double p, bool sorted=false);
53 
54  /**
55     Adding a range [\a first, \a last) into an object of type T. The
56     requirements for the type T is to have an add(double, bool, double)
57     function.
58  */
59  template <typename T, typename ForwardIterator>
60  void add(T& o, ForwardIterator first, ForwardIterator last,
61           const classifier::Target& target)
62  {
63    for (size_t i=0; first!=last; ++i, ++first)
64      o.add(utility::iterator_traits<ForwardIterator>().data(first),
65            target.binary(i), 
66            utility::iterator_traits<ForwardIterator>().weight(first));
67  } 
68
69  ///
70  /// Calculates the probability to get \a k or smaller from a
71  /// hypergeometric distribution with parameters \a n1 \a n2 \a
72  /// t. Hypergeomtric situation you get in the following situation:
73  /// Let there be \a n1 ways for a "good" selection and \a n2 ways
74  /// for a "bad" selection out of a total of possibilities. Take \a
75  /// t samples without replacement and \a k of those are "good"
76  /// samples. \a k will follow a hypergeomtric distribution.
77  ///
78  /// @return cumulative hypergeomtric distribution functions P(k).
79  ///
80  double cdf_hypergeometric_P(unsigned int k, unsigned int n1, 
81                              unsigned int n2, unsigned int t);
82
83
84  /**
85     \brief one-sided p-value
86
87     This function uses the t-distribution to calculate the one-sided
88     p-value. Given that the true correlation is zero (Null
89     hypothesis) the estimated correlation, r, after a transformation
90     is t-distributed:
91
92     \f$ \sqrt{(n-2)} \frac{r}{\sqrt{(1-r^2)}} \in t(n-2) \f$
93
94     \return Probability that correlation is larger than \a r by
95     chance when having \a n samples.
96   */
97  double pearson_p_value(double r, unsigned int n);
98
99  ///
100  /// @brief Computes the kurtosis of the data in a vector.
101  ///
102  /// The kurtosis measures how sharply peaked a distribution is,
103  /// relative to its width. The kurtosis is normalized to zero for a
104  /// gaussian distribution.
105  ///
106  double kurtosis(const utility::VectorBase&);
107
108
109  ///
110  /// @brief Median absolute deviation from median
111  ///
112  /// Function is non-mutable function
113  ///
114  template <class T>
115  double mad(T first, T last, const bool sorted=false)
116  {
117    double m = median(first, last, sorted);
118    std::vector<double> ad;
119    ad.reserve(std::distance(first, last));
120    for( ; first!=last; ++first)
121      ad.push_back(fabs(*first-m));
122    std::sort(ad.begin(), ad.end());
123    return median(ad.begin(), ad.end(), true);
124  }
125 
126
127  ///
128  /// Median is defined to be value in the middle. If number of values
129  /// is even median is the average of the two middle values.  the
130  /// median value is given by p equal to 50. If \a sorted is false
131  /// (default), the range is copied, the copy is sorted, and then
132  /// used to calculate the median.
133  ///
134  /// Function is a non-mutable function, i.e., \a first and \a last
135  /// can be const_iterators.
136  ///
137  /// Requirements: T should be an iterator over a range of doubles (or
138  /// any type being convertable to double).
139  ///
140  /// @return median of range
141  ///
142  template <class T> 
143  double median(T first, T last, const bool sorted=false) 
144  { return percentile(first, last, 50.0, sorted); }
145
146  /**
147     The percentile is determined by the \a p, a number between 0 and
148     100. The percentile is found by interpolation, using the formula
149     \f$ percentile = (1 - \delta) x_i + \delta x_{i+1} \f$ where \a
150     p is floor\f$((n - 1)p/100)\f$ and \f$ \delta \f$ is \f$
151     (n-1)p/100 - i \f$.Thus the minimum value of the vector is given
152     by p equal to zero, the maximum is given by p equal to 100 and
153     the median value is given by p equal to 50. If @a sorted
154     is false (default), the vector is copied, the copy is sorted,
155     and then used to calculate the median.
156
157     Function is a non-mutable function, i.e., \a first and \a last
158     can be const_iterators.
159     
160     Requirements: T should be an iterator over a range of doubles (or
161     any type being convertable to double). If \a sorted is false
162     iterator must be mutable, else read-only iterator is also ok.
163     
164     @return \a p'th percentile of range
165  */
166  template <class T>
167  double percentile(T first, T last, double p, bool sorted=false)
168  {
169    utility::yat_assert<std::range_error>(first<last);
170    utility::yat_assert<std::runtime_error>(p>=0, "percentage is negative");
171    utility::yat_assert<std::runtime_error>(p<=100, 
172                                            "percentage is larger than 100");
173    if (sorted){
174      if (p>=100)
175        return *(--last);
176      // range is one value only is a special case
177      if (std::distance(first, last)==1) 
178        return *first;
179      double j = p/100 * (std::distance(first,last)-1);
180      int i = static_cast<int>(j);
181      return (1-j+floor(j))*first[i] + (j-floor(j))*first[i+1];
182    }
183
184    std::vector<double> v_copy;
185    v_copy.reserve(std::distance(first,last));
186    std::copy(first, last, std::back_inserter(v_copy));
187    double j = p/100 * (v_copy.size()-1);
188    size_t i = static_cast<size_t>(j);
189    if (i+2 < v_copy.size()) {
190      utility::yat_assert<std::out_of_range>(i+2 < v_copy.size(), 
191                                             "in utility::percentile");
192      std::partial_sort(v_copy.begin(), v_copy.begin()+i+2, v_copy.end());
193    }
194    else
195      std::sort(v_copy.begin(), v_copy.end());
196     
197    return percentile(v_copy.begin(), v_copy.end(), p, true);
198  }
199
200  ///
201  /// @brief Computes the skewness of the data in a vector.
202  ///
203  /// The skewness measures the asymmetry of the tails of a
204  /// distribution.
205  ///
206  double skewness(const utility::VectorBase&);
207 
208}}} // of namespace statistics, yat, and theplu
209
210#endif
Note: See TracBrowser for help on using the repository browser.