source: trunk/yat/statistics/utility.h @ 934

Last change on this file since 934 was 934, checked in by Peter, 16 years ago

fixing bug when range is one value only

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 6.4 KB
Line 
1#ifndef _theplu_yat_statistics_utility_
2#define _theplu_yat_statistics_utility_
3
4// $Id: utility.h 934 2007-10-05 21:27:34Z peter $
5
6/*
7  Copyright (C) 2004 Jari Häkkinen, Peter Johansson
8  Copyright (C) 2005 Peter Johansson
9  Copyright (C) 2006 Jari Häkkinen, Markus Ringnér, Peter Johansson
10  Copyright (C) 2007 Jari Häkkinen, Peter Johansson
11
12  This file is part of the yat library, http://trac.thep.lu.se/trac/yat
13
14  The yat library is free software; you can redistribute it and/or
15  modify it under the terms of the GNU General Public License as
16  published by the Free Software Foundation; either version 2 of the
17  License, or (at your option) any later version.
18
19  The yat library is distributed in the hope that it will be useful,
20  but WITHOUT ANY WARRANTY; without even the implied warranty of
21  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22  General Public License for more details.
23
24  You should have received a copy of the GNU General Public License
25  along with this program; if not, write to the Free Software
26  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
27  02111-1307, USA.
28*/
29
30#include "yat/classifier/DataLookupWeighted1D.h"
31#include "yat/classifier/Target.h"
32#include "yat/utility/vector.h"
33#include "yat/utility/yat_assert.h"
34
35#include <algorithm>
36#include <cmath>
37#include <vector>
38
39#include <gsl/gsl_statistics_double.h>
40
41namespace theplu {
42namespace yat {
43namespace statistics { 
44
45  //forward declarations
46  template <class T> 
47  double median(T first, T last, const bool sorted=false); 
48
49  template <class T>
50  double percentile(T first, T last, double p, bool sorted=false);
51 
52  /**
53     Adding each value in an array \a v \a to an object \a o.  The
54     requirements for the type T1 is to have an add(double, bool)
55     function, and for T2 of the array \a v are: operator[] returning
56     an element and function size() returning the number of elements.
57  */
58  template <typename T1, typename T2>
59  void add(T1& o, const T2& v, const classifier::Target& target)
60  {
61    for (size_t i=0; i<v.size(); ++i) 
62      o.add(v[i],target.binary(i));
63  } 
64
65  /**
66     Adding each value in an array \a v \a to an object \a o.  The
67     requirements for the type T1 is to have an add(double, bool)
68     function, and for T2 of the array \a v are: operator[] returning
69     an element and function size() returning the number of elements.
70  */
71  template <typename T1>
72  void add(T1& o, const classifier::DataLookupWeighted1D& v, 
73           const classifier::Target& target)
74  {
75    for (size_t i=0; i<v.size(); ++i) 
76      o.add(v.data(i),target.binary(i),v.weight(i));
77  } 
78
79  ///
80  /// Calculates the probability to get \a k or smaller from a
81  /// hypergeometric distribution with parameters \a n1 \a n2 \a
82  /// t. Hypergeomtric situation you get in the following situation:
83  /// Let there be \a n1 ways for a "good" selection and \a n2 ways
84  /// for a "bad" selection out of a total of possibilities. Take \a
85  /// t samples without replacement and \a k of those are "good"
86  /// samples. \a k will follow a hypergeomtric distribution.
87  ///
88  /// @return cumulative hypergeomtric distribution functions P(k).
89  ///
90  double cdf_hypergeometric_P(u_int k, u_int n1, u_int n2, u_int t);
91
92
93  ///
94  /// @brief Computes the kurtosis of the data in a vector.
95  ///
96  /// The kurtosis measures how sharply peaked a distribution is,
97  /// relative to its width. The kurtosis is normalized to zero for a
98  /// gaussian distribution.
99  ///
100  double kurtosis(const utility::vector&);
101
102
103  ///
104  /// @brief Median absolute deviation from median
105  ///
106  /// Function is non-mutable function
107  ///
108  template <class T>
109  double mad(T first, T last, const bool sorted=false)
110  {
111    double m = median(first, last, sorted);
112    std::vector<double> ad;
113    ad.reserve(std::distance(first, last));
114    for( ; first!=last; ++first)
115      ad.push_back(fabs(*first-m));
116    std::sort(ad.begin(), ad.end());
117    return median(ad.begin(), ad.end(), true);
118  }
119 
120
121  ///
122  /// Median is defined to be value in the middle. If number of values
123  /// is even median is the average of the two middle values.  the
124  /// median value is given by p equal to 50. If \a sorted is false
125  /// (default), the range is copied, the copy is sorted, and then
126  /// used to calculate the median.
127  ///
128  /// Function is a non-mutable function, i.e., \a first and \a last
129  /// can be const_iterators.
130  ///
131  /// Requirements: T should be an iterator over a range of doubles (or
132  /// any type being convertable to double).
133  ///
134  /// @return median of range
135  ///
136  template <class T> 
137  double median(T first, T last, const bool sorted=false) 
138  { return percentile(first, last, 50.0, sorted); }
139
140  /**
141     The percentile is determined by the \a p, a number between 0 and
142     100. The percentile is found by interpolation, using the formula
143     \f$ percentile = (1 - \delta) x_i + \delta x_{i+1} \f$ where \a
144     p is floor\f$((n - 1)p/100)\f$ and \f$ \delta \f$ is \f$
145     (n-1)p/100 - i \f$.Thus the minimum value of the vector is given
146     by p equal to zero, the maximum is given by p equal to 100 and
147     the median value is given by p equal to 50. If @a sorted
148     is false (default), the vector is copied, the copy is sorted,
149     and then used to calculate the median.
150
151     Function is a non-mutable function, i.e., \a first and \a last
152     can be const_iterators.
153     
154     Requirements: T should be an iterator over a range of doubles (or
155     any type being convertable to double). If \a sorted is false
156     iterator must be mutable, else read-only iterator is also ok.
157     
158     @return \a p'th percentile of range
159  */
160  template <class T>
161  double percentile(T first, T last, double p, bool sorted=false)
162  {
163    yat_assert(first<last && "range is invalid");
164    yat_assert(p>=0);
165    yat_assert(p<=100);
166    if (sorted){
167      if (p>=100)
168        return *(--last);
169      // range is one value only is a special case
170      if (std::distance(first, last)==1) 
171        return *first;
172      double j = p/100 * (std::distance(first,last)-1);
173      int i = static_cast<int>(j);
174      return (1-j+floor(j))*first[i] + (j-floor(j))*first[i+1];
175    }
176
177    std::vector<double> v_copy;
178    v_copy.reserve(std::distance(first,last));
179    std::copy(first, last, std::back_inserter(v_copy));
180    double j = p/100 * (v_copy.size()-1);
181    int i = static_cast<int>(j);
182    std::partial_sort(v_copy.begin(),v_copy.begin()+i+2 , v_copy.end());
183    return percentile(v_copy.begin(), v_copy.end(), p, true);
184  }
185
186  ///
187  /// @brief Computes the skewness of the data in a vector.
188  ///
189  /// The skewness measures the asymmetry of the tails of a
190  /// distribution.
191  ///
192  double skewness(const utility::vector&);
193 
194}}} // of namespace statistics, yat, and theplu
195
196#endif
Note: See TracBrowser for help on using the repository browser.