source: trunk/yat/statistics/KolmogorovSmirnov.h @ 3018

Last change on this file since 3018 was 3018, checked in by Peter, 10 years ago

merge patch release 0.10.2 into trunk

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 5.8 KB
Line 
1#ifndef _theplu_yat_statistics_kolmogorov_smirnov_
2#define _theplu_yat_statistics_kolmogorov_smirnov_
3
4// $Id: KolmogorovSmirnov.h 3018 2013-04-04 04:46:38Z peter $
5
6/*
7  Copyright (C) 2008 Jari Häkkinen, Peter Johansson
8  Copyright (C) 2009, 2010, 2011, 2012, 2013 Peter Johansson
9
10  This file is part of the yat library, http://dev.thep.lu.se/yat
11
12  The yat library is free software; you can redistribute it and/or
13  modify it under the terms of the GNU General Public License as
14  published by the Free Software Foundation; either version 3 of the
15  License, or (at your option) any later version.
16
17  The yat library is distributed in the hope that it will be useful,
18  but WITHOUT ANY WARRANTY; without even the implied warranty of
19  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20  General Public License for more details.
21
22  You should have received a copy of the GNU General Public License
23  along with yat. If not, see <http://www.gnu.org/licenses/>.
24*/
25
26#include <boost/concept_check.hpp>
27
28#include <iosfwd>
29#include <set>
30#include <vector>
31
32namespace theplu {
33namespace yat {
34namespace statistics {
35
36  /**
37     \brief Kolmogorov Smirnov Test
38   */
39  class KolmogorovSmirnov
40  {
41  public:
42    /**
43       struct used to store data in KolmogorovSmirnov.
44
45       This struct is public to make usage of the add range function
46       more convenient.
47
48       \since New in yat 0.5
49     */
50    struct Element
51    {
52      /**
53         \brief default constructor
54      */
55      Element(void);
56
57      /**
58         \brief Constructor
59       */
60      Element(double x, bool class_label, double w=1.0);
61
62      /**
63         \brief data value
64       */
65      double value;
66
67      /**
68          bool telling which group the data point belongs to
69      */
70      bool label;
71
72      /**
73         weight for the data point
74       */
75      double weight;
76
77      /**
78         \return true if value is less than rhs.value
79       */
80      bool operator<(const Element& rhs) const;
81    };
82
83    /**
84       \brief Constructor
85     */
86    KolmogorovSmirnov(void);
87
88    /**
89       \brief add a value
90     */
91    void add(double value, bool class_label, double weight=1.0);
92
93    /**
94       \brief add a range
95
96       value_type of \a ForwardIterator must be convertible to
97       KolmogorovSmirnov::Element
98
99       Insertion takes typically N*log(N). However, if range is
100       sorted, insertion takes linear time. A common use case of this
101       function is when calculating several KS statistics on a data
102       set (values and weights) with different class labels.
103
104       \since New in yat 0.5
105    */
106    template <typename ForwardIterator>
107    void add(ForwardIterator first, ForwardIterator last);
108
109    /**
110       \brief Large-Sample Approximation
111
112       This analytical approximation of p-value can be used when all
113       weight equal unity and sample sizes \a n and \a m are
114       large. The p-value is calcuated as \f$ P = \displaystyle - 2
115       \sum_{k=1}^{\infty} (-1)^ke^{-2k^2s^2}\f$, where s is the
116       scaled score:
117
118       \f$ s = \sqrt\frac{nm}{n+m} \f$ score().
119
120       \since New in yat 0.5
121
122       Following Hollander and Wolfe
123    */
124    double p_value(void) const;
125
126    /**
127       \brief p-value
128
129       Performs a permutation test using \a perm label randomizations
130       and calculate how often a score equal or larger than score() is
131       obtained.
132
133       \see shuffle
134    */
135    double p_value(size_t perm) const;
136
137    /**
138       \brief Remove a data point
139
140       \throw utility::runtime_error if no data point exist with \a
141       value, \a class_label, and \a weight.
142
143       \since New in yat 0.9
144     */
145    void remove(double value, bool class_label, double weight=1.0);
146
147    /**
148       \brief resets everything to zero
149    */
150    void reset(void);
151
152    /**
153       \brief Kolmogorov Smirnov statistic
154
155       \f$ sup_x | F_{\textrm{True}}(x) - F_{\textrm{False}}(x) | \f$ where
156       \f$ F(x) = \frac{\sum_{i:x_i\le x}w_i}{ \sum w_i} \f$
157    */
158    double score(void) const;
159
160    /**
161       \brief shuffle class labels
162
163       This is equivalent to reset and re-add values with shuffled
164       class labels.
165
166       \since New in yat 0.5
167     */
168    void shuffle(void);
169
170    /**
171       Same as score() but keeping the sign, in other words,
172       abs(signed_score())==score()
173
174       A positive score implies that values in class \c true \em on
175       \em average are smaller than values in class \c false.
176
177       \since New in yat 0.5
178     */
179    double signed_score(void) const;
180
181  private:
182    void scores(std::vector<double>&) const;
183    // add weights to sum_w1 and sum_w2 respectively depending on
184    // label in element.
185    template <typename ForwardIterator>
186    void add_sum_w(ForwardIterator first, ForwardIterator last);
187
188    mutable bool cached_;
189    mutable double score_;
190    typedef std::multiset<Element> data_w;
191    data_w data_;
192    double sum_w1_;
193    double sum_w2_;
194
195    friend std::ostream& operator<<(std::ostream&, const KolmogorovSmirnov&);
196
197    // using compiler generated copy and assignment
198    //KolmogorovSmirnov(const KolmogorovSmirnov&);
199    //KolmogorovSmirnov& operator=(const KolmogorovSmirnov&);
200  };
201
202  /**
203     \brief output operator
204
205     \relates KolmogorovSmirnov
206   */
207  std::ostream& operator<<(std::ostream&, const KolmogorovSmirnov&);
208
209
210  //  template implementations
211
212  template <typename ForwardIterator>
213  void KolmogorovSmirnov::add(ForwardIterator first, ForwardIterator last)
214  {
215    BOOST_CONCEPT_ASSERT((boost::ForwardIterator<ForwardIterator>));
216    typedef typename std::iterator_traits<ForwardIterator>::reference ref;
217    BOOST_CONCEPT_ASSERT((boost::Convertible<ref, KolmogorovSmirnov::Element>));
218    ForwardIterator iter(first);
219    typename data_w::const_iterator hint(data_.begin());
220    for ( ; iter!=last; ++iter)
221      if ((*iter).weight) // ignore data points with zero weight
222        hint = data_.insert(hint, *iter);
223    add_sum_w(first, last);
224    cached_=false;
225  }
226
227
228  template <typename ForwardIterator>
229  void KolmogorovSmirnov::add_sum_w(ForwardIterator first,
230                                    ForwardIterator last)
231  {
232    while (first!=last) {
233      if ((*first).label)
234        sum_w1_ += (*first).weight;
235      else
236        sum_w2_ += (*first).weight;
237      ++first;
238    }
239  }
240
241}}} // of namespace theplu yat statistics
242
243#endif
Note: See TracBrowser for help on using the repository browser.