source: trunk/yat/statistics/utility.h @ 1998

Last change on this file since 1998 was 1998, checked in by Peter, 13 years ago

Merged patch release 0.5.4 to the trunk. Delta 0.5.4 - 0.5.3

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 8.0 KB
Line 
1#ifndef _theplu_yat_statistics_utility_
2#define _theplu_yat_statistics_utility_
3
4// $Id: utility.h 1998 2009-06-13 00:34:19Z peter $
5
6/*
7  Copyright (C) 2004 Jari Häkkinen, Peter Johansson
8  Copyright (C) 2005 Peter Johansson
9  Copyright (C) 2006 Jari Häkkinen, Peter Johansson, Markus Ringnér
10  Copyright (C) 2007, 2008 Jari Häkkinen, Peter Johansson
11  Copyright (C) 2009 Peter Johansson
12
13  This file is part of the yat library, http://dev.thep.lu.se/yat
14
15  The yat library is free software; you can redistribute it and/or
16  modify it under the terms of the GNU General Public License as
17  published by the Free Software Foundation; either version 3 of the
18  License, or (at your option) any later version.
19
20  The yat library is distributed in the hope that it will be useful,
21  but WITHOUT ANY WARRANTY; without even the implied warranty of
22  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23  General Public License for more details.
24
25  You should have received a copy of the GNU General Public License
26  along with yat. If not, see <http://www.gnu.org/licenses/>.
27*/
28
29#include "Percentiler.h"
30
31#include "yat/classifier/DataLookupWeighted1D.h"
32#include "yat/classifier/Target.h"
33#include "yat/utility/deprecate.h"
34#include "yat/utility/iterator_traits.h"
35#include "yat/utility/VectorBase.h"
36#include "yat/utility/yat_assert.h"
37
38#include <algorithm>
39#include <cmath>
40#include <stdexcept>
41#include <vector>
42
43#include <gsl/gsl_statistics_double.h>
44
45namespace theplu {
46namespace yat {
47namespace statistics { 
48
49  /**
50     Adding a range [\a first, \a last) into an object of type T. The
51     requirements for the type T is to have an add(double, bool, double)
52     function.
53  */
54  template <typename T, typename ForwardIterator>
55  void add(T& o, ForwardIterator first, ForwardIterator last,
56           const classifier::Target& target);
57
58  ///
59  /// Calculates the probability to get \a k or smaller from a
60  /// hypergeometric distribution with parameters \a n1 \a n2 \a
61  /// t. Hypergeomtric situation you get in the following situation:
62  /// Let there be \a n1 ways for a "good" selection and \a n2 ways
63  /// for a "bad" selection out of a total of possibilities. Take \a
64  /// t samples without replacement and \a k of those are "good"
65  /// samples. \a k will follow a hypergeomtric distribution.
66  ///
67  /// @return cumulative hypergeomtric distribution functions P(k).
68  ///
69  double cdf_hypergeometric_P(unsigned int k, unsigned int n1, 
70                              unsigned int n2, unsigned int t);
71
72
73  /**
74     \brief one-sided p-value
75
76     This function uses the t-distribution to calculate the one-sided
77     p-value. Given that the true correlation is zero (Null
78     hypothesis) the estimated correlation, r, after a transformation
79     is t-distributed:
80
81     \f$ \sqrt{(n-2)} \frac{r}{\sqrt{(1-r^2)}} \in t(n-2) \f$
82
83     \return Probability that correlation is larger than \a r by
84     chance when having \a n samples. For \a r larger or equal to 1.0,
85     0.0 is returned. For \a r smaller or equal to -1.0, 1.0 is
86     returned.
87   */
88  double pearson_p_value(double r, unsigned int n);
89
90  ///
91  /// @brief Computes the kurtosis of the data in a vector.
92  ///
93  /// The kurtosis measures how sharply peaked a distribution is,
94  /// relative to its width. The kurtosis is normalized to zero for a
95  /// gaussian distribution.
96  ///
97  double kurtosis(const utility::VectorBase&);
98
99
100  ///
101  /// @brief Median absolute deviation from median
102  ///
103  /// Function is non-mutable function
104  ///
105  template <class T>
106  double mad(T first, T last, const bool sorted=false);
107
108
109  ///
110  /// Median is defined to be value in the middle. If number of values
111  /// is even median is the average of the two middle values.  the
112  /// median value is given by p equal to 50. If \a sorted is false
113  /// (default), the range is copied, the copy is sorted, and then
114  /// used to calculate the median.
115  ///
116  /// Function is a non-mutable function, i.e., \a first and \a last
117  /// can be const_iterators.
118  ///
119  /// Requirements: \c T should either be unweighted with value type
120  /// convertible to \c double or \c T should be weighted with value
121  /// type convertible to utility::DataWeight
122  ///
123  /// @return median of range
124  ///
125  template <class T> 
126  double median(T first, T last, const bool sorted=false); 
127
128
129  /**
130     The percentile is determined by the \a p, a number between 0 and
131     100. The percentile is found by interpolation, using the formula
132     \f$ percentile = (1 - \delta) x_i + \delta x_{i+1} \f$ where \a
133     p is floor\f$((n - 1)p/100)\f$ and \f$ \delta \f$ is \f$
134     (n-1)p/100 - i \f$.Thus the minimum value of the vector is given
135     by p equal to zero, the maximum is given by p equal to 100 and
136     the median value is given by p equal to 50. If @a sorted
137     is false (default), the vector is copied, the copy is sorted,
138     and then used to calculate the median.
139
140     Function is a non-mutable function, i.e., \a first and \a last
141     can be const_iterators.
142     
143     Requirements: RandomAccessIterator is an iterator over a range of
144     doubles (or any type being convertable to double).
145     
146     @return \a p'th percentile of range
147
148     \deprecated percentile2 will replace this function in the future
149
150     \note the definition of percentile used here is not identical to
151     that one used in percentile2 and Percentile. The difference is
152     smaller for large ranges.
153  */
154  template <class RandomAccessIterator>
155  double percentile(RandomAccessIterator first, RandomAccessIterator last, 
156                    double p, bool sorted=false) YAT_DEPRECATE;
157 
158
159  /**
160     \see Percentiler
161     
162     \since new in yat 0.5
163   */
164  template <class RandomAccessIterator>
165  double percentile2(RandomAccessIterator first, RandomAccessIterator last, 
166                     double p, bool sorted=false)
167  {
168    Percentiler percentiler(p, sorted);
169    return percentiler(first, last);
170  }
171
172  ///
173  /// @brief Computes the skewness of the data in a vector.
174  ///
175  /// The skewness measures the asymmetry of the tails of a
176  /// distribution.
177  ///
178  double skewness(const utility::VectorBase&);
179 
180
181  //////// template implementations ///////////
182
183  template <typename T, typename ForwardIterator>
184  void add(T& o, ForwardIterator first, ForwardIterator last,
185           const classifier::Target& target)
186  {
187    for (size_t i=0; first!=last; ++i, ++first)
188      o.add(utility::iterator_traits<ForwardIterator>().data(first),
189            target.binary(i), 
190            utility::iterator_traits<ForwardIterator>().weight(first));
191  } 
192
193
194  template <class T>
195  double mad(T first, T last, const bool sorted=false)
196  {
197    double m = median(first, last, sorted);
198    typedef std::vector<typename std::iterator_traits<T>::value_type> Vec;
199    Vec ad;
200    ad.reserve(std::distance(first, last));
201    // copy values (and weights if T is weighted)
202    std::copy(first, last, std::back_insert_iterator<Vec>(ad));
203    utility::iterator_traits<typename Vec::iterator> traits;
204    for (typename Vec::iterator i=ad.begin() ; i!=ad.end(); ++i) {
205      // adjust data
206      traits.data(i) = std::abs(traits.data(i)-m);
207    }
208    std::sort(ad.begin(), ad.end());
209    return median(ad.begin(), ad.end(), true);
210  }
211 
212
213  template <class T> 
214  double median(T first, T last, const bool sorted=false) 
215  { 
216    return percentile2(first, last, 50.0, sorted); 
217  }
218
219
220  template <class RandomAccessIterator>
221  double percentile(RandomAccessIterator first, RandomAccessIterator last, 
222                    double p, bool sorted=false)
223  {
224    // range is one value only is a special case
225    if (first+1 == last)
226      return utility::iterator_traits<RandomAccessIterator>().data(first);
227    if (sorted) {
228      // have to take care of this special case
229      if (p>=100)
230        return utility::iterator_traits<RandomAccessIterator>().data(--last);
231      double j = p/100 * (std::distance(first,last)-1);
232      int i = static_cast<int>(j);
233      return (1-j+floor(j))*first[i] + (j-floor(j))*first[i+1];
234    }
235
236    std::vector<typename std::iterator_traits<RandomAccessIterator>::value_type> 
237      v_copy;
238    v_copy.reserve(std::distance(first,last));
239    std::copy(first, last, std::back_inserter(v_copy));
240    size_t i = static_cast<size_t>(p/100 * (v_copy.size()-1));
241    if (i+2 < v_copy.size()) {
242      std::partial_sort(v_copy.begin(), v_copy.begin()+i+2, v_copy.end());
243    }
244    else
245      std::sort(v_copy.begin(), v_copy.end());
246    return percentile(v_copy.begin(), v_copy.end(), p, true);
247  }
248
249
250}}} // of namespace statistics, yat, and theplu
251
252#endif
Note: See TracBrowser for help on using the repository browser.