source: trunk/c++_tools/statistics/utility.h @ 588

Last change on this file since 588 was 588, checked in by Markus Ringnér, 15 years ago

Added statistics functions to calculate the higher moments: skewness and kurtosis using gsl functions

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 4.6 KB
Line 
1// $Id: utility.h 588 2006-06-21 15:43:29Z markus $
2
3#ifndef _theplu_statistics_utility_
4#define _theplu_statistics_utility_
5
6#include <c++_tools/gslapi/vector.h>
7
8#include <algorithm>
9#include <cassert>
10#include <cmath>
11#include <vector>
12
13#include <gsl/gsl_statistics_double.h>
14
15namespace theplu {
16namespace statistics { 
17
18  //forward declarations
19  template <class T>
20  double percentile(const std::vector<T>& vec, const double p, 
21                    const bool sorted=false);
22 
23
24  ///
25  /// Calculates the probability to get \a k or smaller from a
26  /// hypergeometric distribution with parameters \a n1 \a n2 \a
27  /// t. Hypergeomtric situation you get in the following situation:
28  /// Let there be \a n1 ways for a "good" selection and \a n2 ways
29  /// for a "bad" selection out of a total of possibilities. Take \a
30  /// t samples without replacement and \a k of those are "good"
31  /// samples. \a k will follow a hypergeomtric distribution.
32  /// @cumulative hypergeomtric distribution functions P(k).
33  ///
34  double cdf_hypergeometric_P(u_int k, u_int n1, u_int n2, u_int t);
35
36
37  ///
38  /// Computes the kurtosis of the data in a vector. The kurtosis
39  /// measures how sharply peaked a distribution is, relative to its
40  /// width. The kurtosis is normalized to zero for a gaussian
41  /// distribution.
42  ///
43  inline double kurtosis(const gslapi::vector& v)
44  {
45    const gsl_vector* gvp=v.gsl_vector_p();
46    return gsl_stats_kurtosis(gvp->data,gvp->stride,gvp->size);
47  }
48
49
50  ///
51  /// Median is defined to be value in the middle. If number of values
52  /// is even median is the average of the two middle values.  the
53  /// median value is given by p equal to 50. If @a sorted is false
54  /// (default), the vector is copied, the copy is sorted, and then
55  /// used to calculate the median.
56  ///
57  /// @return median
58  ///
59  /// @note interface will change
60  ///
61  template <class T> 
62  inline double median(const std::vector<T>& v, const bool sorted=false) 
63  { return percentile(v, 50.0, sorted); }
64
65  ///
66  /// Median is defined to be value in the middle. If number of values
67  /// is even median is the average of the two middle values. If @a
68  /// sorted is true, the function assumes vector @a vec to be
69  /// sorted. If @a sorted is false, the vector is copied, the copy is
70  /// sorted (default), and then used to calculate the median.
71  ///
72  /// @return median
73  ///
74  double median(const gslapi::vector& vec, const bool sorted=false);
75
76  ///
77  /// The percentile is determined by the \a p, a number between 0 and
78  /// 100. The percentile is found by interpolation, using the formula
79  /// \f$ percentile = (1 - \delta) x_i + \delta x_{i+1} \f$ where \a
80  /// p is floor\f$((n - 1)p/100)\f$ and \f$ \delta \f$ is \f$
81  /// (n-1)p/100 - i \f$.Thus the minimum value of the vector is given
82  /// by p equal to zero, the maximum is given by p equal to 100 and
83  /// the median value is given by p equal to 50. If @a sorted
84  /// is false (default), the vector is copied, the copy is sorted,
85  /// and then used to calculate the median.
86  ///
87  /// @return \a p'th percentile
88  ///
89  template <class T>
90  double percentile(const std::vector<T>& vec, const double p, 
91                    const bool sorted=false)
92  {
93    assert(!(p>100 && p<0));
94    if (sorted){
95      if (p>=100)
96        return vec.back();
97      double j = p/100 * (vec.size()-1);
98      int i = static_cast<int>(j);
99      return (1-j+floor(j))*vec[i] + (j-floor(j))*vec[i+1];
100    }
101    if (p==100)
102      return  *std::max_element(vec.begin(),vec.end());
103    std::vector<T> v_copy(vec);
104    double j = p/100 * (v_copy.size()-1);
105    int i = static_cast<int>(j);
106    std::partial_sort(v_copy.begin(),v_copy.begin()+i+2 , v_copy.end());
107    return (1-j+floor(j))*v_copy[i] + (j-floor(j))*v_copy[i+1];
108 
109  }
110
111  ///
112  /// The percentile is determined by the \a p, a number between 0 and
113  /// 100. The percentile is found by interpolation, using the formula
114  /// \f$ percentile = (1 - \delta) x_i + \delta x_{i+1} \f$ where \a
115  /// p is floor\f$((n - 1)p/100)\f$ and \f$ \delta \f$ is \f$
116  /// (n-1)p/100 - i \f$.Thus the minimum value of the vector is given
117  /// by p equal to zero, the maximum is given by p equal to 100 and
118  /// the median value is given by p equal to 50. If @a sorted
119  /// is false (default), the vector is copied, the copy is sorted,
120  /// and then used to calculate the median.
121  ///
122  /// @return \a p'th percentile
123  ///
124  double percentile(const gslapi::vector& vec, const double, 
125                    const bool sorted=false);
126
127  ///
128  /// Computes the skewness of the data in a vector. The skewness
129  /// measures the asymmetry of the tails of a distribution.
130  ///
131  inline double skewness(const gslapi::vector& v)
132  {
133    const gsl_vector* gvp=v.gsl_vector_p();
134    return gsl_stats_skew(gvp->data,gvp->stride,gvp->size);
135  }
136 
137
138}} // of namespace statistics and namespace theplu
139
140#endif
141
Note: See TracBrowser for help on using the repository browser.