source: trunk/yat/statistics/KolmogorovSmirnov.h

Last change on this file was 3550, checked in by Peter, 4 years ago

Update copyright years. Happy New Year

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 5.9 KB
Line 
1#ifndef _theplu_yat_statistics_kolmogorov_smirnov_
2#define _theplu_yat_statistics_kolmogorov_smirnov_
3
4// $Id: KolmogorovSmirnov.h 3550 2017-01-03 05:41:02Z peter $
5
6/*
7  Copyright (C) 2008 Jari Häkkinen, Peter Johansson
8  Copyright (C) 2009, 2010, 2011, 2012, 2013, 2016 Peter Johansson
9
10  This file is part of the yat library, http://dev.thep.lu.se/yat
11
12  The yat library is free software; you can redistribute it and/or
13  modify it under the terms of the GNU General Public License as
14  published by the Free Software Foundation; either version 3 of the
15  License, or (at your option) any later version.
16
17  The yat library is distributed in the hope that it will be useful,
18  but WITHOUT ANY WARRANTY; without even the implied warranty of
19  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20  General Public License for more details.
21
22  You should have received a copy of the GNU General Public License
23  along with yat. If not, see <http://www.gnu.org/licenses/>.
24*/
25
26#include "yat/utility/concept_check.h"
27
28#include <boost/concept_check.hpp>
29#include <boost/iterator/iterator_concepts.hpp>
30
31#include <iosfwd>
32#include <set>
33#include <vector>
34
35namespace theplu {
36namespace yat {
37namespace statistics {
38
39  /**
40     \brief Kolmogorov Smirnov Test
41   */
42  class KolmogorovSmirnov
43  {
44  public:
45    /**
46       struct used to store data in KolmogorovSmirnov.
47
48       This struct is public to make usage of the add range function
49       more convenient.
50
51       \since New in yat 0.5
52     */
53    struct Element
54    {
55      /**
56         \brief default constructor
57      */
58      Element(void);
59
60      /**
61         \brief Constructor
62       */
63      Element(double x, bool class_label, double w=1.0);
64
65      /**
66         \brief data value
67       */
68      double value;
69
70      /**
71          bool telling which group the data point belongs to
72      */
73      bool label;
74
75      /**
76         weight for the data point
77       */
78      double weight;
79
80      /**
81         \return true if value is less than rhs.value
82       */
83      bool operator<(const Element& rhs) const;
84    };
85
86    /**
87       \brief Constructor
88     */
89    KolmogorovSmirnov(void);
90
91    /**
92       \brief add a value
93     */
94    void add(double value, bool class_label, double weight=1.0);
95
96    /**
97       \brief add a range
98
99       Type Requirements:
100       - \c ForwardIterator models \readable_iterator
101       - \c ForwardIterator models \forward_traversal_iterator
102       - \c value_type of \c ForwardIterator must be convertible to
103       \c KolmogorovSmirnov::Element
104
105       Insertion takes typically N*log(N). However, if range is
106       sorted, insertion takes linear time. A common use case of this
107       function is when calculating several KS statistics on a data
108       set (values and weights) with different class labels.
109
110       \since New in yat 0.5
111    */
112    template <typename ForwardIterator>
113    void add(ForwardIterator first, ForwardIterator last);
114
115    /**
116       \brief Large-Sample Approximation
117
118       This analytical approximation of p-value can be used when all
119       weight equal unity and sample sizes \a n and \a m are
120       large. The p-value is calcuated as \f$ P = \displaystyle - 2
121       \sum_{k=1}^{\infty} (-1)^ke^{-2k^2s^2}\f$, where s is the
122       scaled score:
123
124       \f$ s = \sqrt\frac{nm}{n+m} \f$ score().
125
126       \since New in yat 0.5
127
128       Following Hollander and Wolfe
129    */
130    double p_value(void) const;
131
132    /**
133       \brief p-value
134
135       Performs a permutation test using \a perm label randomizations
136       and calculate how often a score equal or larger than score() is
137       obtained.
138
139       \see shuffle
140    */
141    double p_value(size_t perm) const;
142
143    /**
144       \brief Remove a data point
145
146       \throw utility::runtime_error if no data point exist with \a
147       value, \a class_label, and \a weight.
148
149       \since New in yat 0.9
150     */
151    void remove(double value, bool class_label, double weight=1.0);
152
153    /**
154       \brief resets everything to zero
155    */
156    void reset(void);
157
158    /**
159       \brief Kolmogorov Smirnov statistic
160
161       \f$ sup_x | F_{\textrm{True}}(x) - F_{\textrm{False}}(x) | \f$ where
162       \f$ F(x) = \frac{\sum_{i:x_i\le x}w_i}{ \sum w_i} \f$
163    */
164    double score(void) const;
165
166    /**
167       \brief shuffle class labels
168
169       This is equivalent to reset and re-add values with shuffled
170       class labels.
171
172       \since New in yat 0.5
173     */
174    void shuffle(void);
175
176    /**
177       Same as score() but keeping the sign, in other words,
178       abs(signed_score())==score()
179
180       A positive score implies that values in class \c true \em on
181       \em average are smaller than values in class \c false.
182
183       \since New in yat 0.5
184     */
185    double signed_score(void) const;
186
187  private:
188    void scores(std::vector<double>&) const;
189    // add weights to sum_w1 and sum_w2 respectively depending on
190    // label in element.
191    template <typename ForwardIterator>
192    void add_sum_w(ForwardIterator first, ForwardIterator last);
193
194    mutable bool cached_;
195    mutable double score_;
196    typedef std::multiset<Element> data_w;
197    data_w data_;
198    double sum_w1_;
199    double sum_w2_;
200
201    friend std::ostream& operator<<(std::ostream&, const KolmogorovSmirnov&);
202
203    // using compiler generated copy and assignment
204    //KolmogorovSmirnov(const KolmogorovSmirnov&);
205    //KolmogorovSmirnov& operator=(const KolmogorovSmirnov&);
206  };
207
208  /**
209     \brief output operator
210
211     \relates KolmogorovSmirnov
212   */
213  std::ostream& operator<<(std::ostream&, const KolmogorovSmirnov&);
214
215
216  //  template implementations
217
218  template <typename ForwardIterator>
219  void KolmogorovSmirnov::add(ForwardIterator first, ForwardIterator last)
220  {
221    BOOST_CONCEPT_ASSERT((boost_concepts::ForwardTraversal<ForwardIterator>));
222    BOOST_CONCEPT_ASSERT((boost_concepts::ReadableIterator<ForwardIterator>));
223    ForwardIterator iter(first);
224    typename data_w::const_iterator hint(data_.begin());
225    for ( ; iter!=last; ++iter)
226      if ((*iter).weight) // ignore data points with zero weight
227        hint = data_.insert(hint, *iter);
228    add_sum_w(first, last);
229    cached_=false;
230  }
231
232
233  template <typename ForwardIterator>
234  void KolmogorovSmirnov::add_sum_w(ForwardIterator first,
235                                    ForwardIterator last)
236  {
237    while (first!=last) {
238      if ((*first).label)
239        sum_w1_ += (*first).weight;
240      else
241        sum_w2_ += (*first).weight;
242      ++first;
243    }
244  }
245
246}}} // of namespace theplu yat statistics
247
248#endif
Note: See TracBrowser for help on using the repository browser.