source: trunk/yat/statistics/KolmogorovSmirnov.h @ 1703

Last change on this file since 1703 was 1701, checked in by Peter, 13 years ago

class KolmogorovSmirnov?:
add (sorted) range (fixes #462)
allow copy and assignment
add a shuffle function
p_value calculation now scales linearly with number of data points and
not N*logN as before

configure.ac: added a test to see if std::multiset::iterator is mutable

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 5.1 KB
Line 
1#ifndef _theplu_yat_statistics_kolmogorov_smirnov_
2#define _theplu_yat_statistics_kolmogorov_smirnov_
3
4// $Id: KolmogorovSmirnov.h 1701 2009-01-07 22:53:05Z peter $
5
6/*
7  Copyright (C) 2008, 2009 Peter Johansson
8
9  This file is part of the yat library, http://dev.thep.lu.se/yat
10
11  The yat library is free software; you can redistribute it and/or
12  modify it under the terms of the GNU General Public License as
13  published by the Free Software Foundation; either version 3 of the
14  License, or (at your option) any later version.
15
16  The yat library is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19  General Public License for more details.
20
21  You should have received a copy of the GNU General Public License
22  along with yat. If not, see <http://www.gnu.org/licenses/>.
23*/
24
25#include <set>
26#include <vector>
27
28namespace theplu {
29namespace yat {
30namespace statistics { 
31
32  /**
33     \brief Kolmogow Smirnov Test
34   */
35  class KolmogorovSmirnov
36  {
37  public:
38    /**
39       struct used to store data in KolmogorovSmirnov.
40       
41       This struct is public to make usage of the add range function
42       more convenient.
43
44       \since New in yat 0.5
45     */
46    struct Element
47    {
48      /**
49         \brief default constructor
50      */
51      inline Element(void) {};
52
53      /**
54         \brief Constructor
55       */
56      inline Element(double x, bool class_label, double w=1.0)
57        : value(x), label(class_label), weight(w) {};
58
59      /**
60         \brief data value
61       */
62      double value;
63     
64      /**
65          bool telling which group the data point belong to
66      */
67      bool label;
68
69      /**
70         weight for the data point
71       */
72      double weight;
73
74      /**
75         \return true if value is less than rhs.value
76       */
77      bool operator<(const Element rhs) const;
78    };
79
80    /**
81       \brief Constructor
82     */ 
83    KolmogorovSmirnov(void);
84
85    /**
86       \brief add a value
87     */
88    void add(double value, bool class_label, double weight=1.0);
89
90    /**
91       \brief add a range
92       
93       value_type of \a ForwardIterator must be convertible to
94       KolmogorovSmirnov::Element
95       
96       Insertion takes typically N*log(N). However, if range is
97       sorted, insertion takes linear time. A common use case of this
98       function is when calculating several KS statistics on a data
99       set (values and weights) with different class labels.
100
101       \since New in yat 0.5
102    */
103    template <typename ForwardIterator>
104    void add(ForwardIterator first, ForwardIterator last); 
105
106    /**
107       \brief Large-Sample Approximation
108
109       This analytical approximation of p-value can be used when all
110       weight equal unity and sample sizes \a n and \a m are
111       large. The p-value is calcuated as \f$ P = \displaystyle - 2
112       \sum_{k=1}^{\infty} (-1)^ke^{-2k^2s^2}\f$, where s is the
113       scaled score:
114
115       \f$ s = \sqrt\frac{nm}{n+m} \f$ score().
116
117       \since New in yat 0.5
118
119       Following Hollander and Wolfe
120    */
121    double p_value(void) const;
122
123    /**
124       \brief p-value
125
126       Performs a permutation test using \a perm label randomizations
127       and calculate how often a score equal or larger than score() is
128       obtained.
129
130       \see shuffle
131    */
132    double p_value(size_t perm) const;
133
134    /**
135       \brief resets everything to zero
136    */
137    void reset(void);
138
139    /**
140       \brief Kolmogorov Smirnov statistic
141
142       \f$ sup_x | F_{\textrm{True}}(x) - F_{\textrm{False}}(x) | \f$ where
143       \f$ F(x) = \frac{\sum_{i:x_i\le x}w_i}{ \sum w_i} \f$
144    */
145    double score(void) const;
146
147    /**
148       \brief shuffle class labels
149
150       This is equivalent to reset and re-add values with shuffled
151       class labels.
152
153       \since New in yat 0.5
154     */
155    void shuffle(void);
156
157    /**
158       Same as score() but keeping the sign, in other words,
159       abs(signed_score())==score()
160
161       \since New in yat 0.5
162     */
163    double signed_score(void) const;
164
165  private:
166    void scores(std::vector<double>&) const;
167    // add weights to sum_w1 and sum_w2 respectively depending on
168    // label in element.
169    template <typename ForwardIterator>
170    void add_sum_w(ForwardIterator first, ForwardIterator last);
171   
172    mutable bool cached_;
173    mutable double score_;
174    typedef std::multiset<Element> data_w;
175    data_w data_;
176    double sum_w1_;
177    double sum_w2_;
178
179    friend std::ostream& operator<<(std::ostream&, const KolmogorovSmirnov&);
180
181    // using compiler generated copy and assignment
182    //KolmogorovSmirnov(const KolmogorovSmirnov&);
183    //KolmogorovSmirnov& operator=(const KolmogorovSmirnov&);
184  };
185
186  /**
187     \brief output operator
188   */
189  std::ostream& operator<<(std::ostream&, const KolmogorovSmirnov&);
190
191
192  //  template implementations
193
194  template <typename ForwardIterator>
195  void KolmogorovSmirnov::add(ForwardIterator first, ForwardIterator last)
196  {
197    ForwardIterator iter(first);
198    typename data_w::const_iterator hint(data_.begin());
199    for ( ; iter!=last; ++iter) 
200      if (iter->weight) // ignore data points with zero weight
201        hint = data_.insert(hint, *iter);
202    add_sum_w(first, last);
203    cached_=false;
204  }
205
206
207  template <typename ForwardIterator>
208  void KolmogorovSmirnov::add_sum_w(ForwardIterator first, 
209                                    ForwardIterator last)
210  {
211    while (first!=last) {
212      if (first->label)
213        sum_w1_ += first->weight;
214      else
215        sum_w2_ += first->weight;
216      ++first;
217    }
218  }
219
220}}} // of namespace theplu yat statistics
221
222#endif
Note: See TracBrowser for help on using the repository browser.