source: trunk/yat/statistics/KolmogorovSmirnov.cc @ 1701

Last change on this file since 1701 was 1701, checked in by Peter, 12 years ago

class KolmogorovSmirnov?:
add (sorted) range (fixes #462)
allow copy and assignment
add a shuffle function
p_value calculation now scales linearly with number of data points and
not N*logN as before

configure.ac: added a test to see if std::multiset::iterator is mutable

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 4.3 KB
Line 
1// $Id: KolmogorovSmirnov.cc 1701 2009-01-07 22:53:05Z peter $
2
3/*
4  Copyright (C) 2008, 2009 Peter Johansson
5
6  This file is part of the yat library, http://dev.thep.lu.se/yat
7
8  The yat library is free software; you can redistribute it and/or
9  modify it under the terms of the GNU General Public License as
10  published by the Free Software Foundation; either version 3 of the
11  License, or (at your option) any later version.
12
13  The yat library is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16  General Public License for more details.
17
18  You should have received a copy of the GNU General Public License
19  along with yat. If not, see <http://www.gnu.org/licenses/>.
20*/
21
22#include <config.h>
23
24#include "KolmogorovSmirnov.h"
25#include "yat/random/random.h"
26#include "yat/utility/stl_utility.h"
27
28#include <algorithm>
29#include <cassert>
30#include <cmath>
31#include <deque>
32#include <functional>
33#include <limits>
34
35namespace theplu {
36namespace yat {
37namespace statistics {
38
39
40  KolmogorovSmirnov::KolmogorovSmirnov(void)
41    : cached_(true), score_(0.0), sum_w1_(0.0), sum_w2_(0.0)
42  {
43  }
44
45
46  void KolmogorovSmirnov::add(double x, bool target, double w)
47  {
48    if (w==0) // ignoring zero weight data
49      return;
50
51    data_.insert(Element(x,target,w));
52    if (target){
53      sum_w1_+=w;
54    }
55    else {
56      sum_w2_+=w;
57    }
58    cached_=false;
59  }
60
61
62  double KolmogorovSmirnov::p_value(void) const
63  {
64    double res=0;
65    double res2=0;
66    double s2 = score() * score() * sum_w1_*sum_w2_/(sum_w1_+sum_w2_);
67    int sign = 1;
68    for (size_t k = 1; k<100; ++k) {
69      res += sign * 2 * exp(-2.0 * k * k * s2);
70      sign *= -1;
71      // ignore remaining terms as they are small
72      if (res==res2)
73        return res;
74      res2 = res;
75    }
76
77    return res;
78  }
79
80
81  double KolmogorovSmirnov::p_value(size_t perm) const
82  {
83    size_t count=0;
84    double score_threshold = score()-10*std::numeric_limits<double>().epsilon();
85    KolmogorovSmirnov ks(*this);
86
87    for (size_t i=0; i<perm; ++i){
88      ks.shuffle();
89      if (ks.score()>=score_threshold)
90        ++count;
91    }
92    return static_cast<double>(count)/perm;
93  }
94
95
96  void KolmogorovSmirnov::reset(void)
97  {
98    cached_=false;
99    data_.clear();
100    sum_w1_=0;
101    sum_w2_=0;
102  }
103
104
105  double KolmogorovSmirnov::score(void) const
106  {
107    return std::abs(signed_score());
108  }
109
110
111  void KolmogorovSmirnov::scores(std::vector<double>& res) const
112  {
113    res.clear();
114    res.reserve(data_.size());
115    data_w::const_iterator iter(data_.begin());
116    double f1=0;
117    double f2=0;
118    while(iter!=data_.end()){
119      size_t count=0;
120      double value = iter->value;
121      while (iter!=data_.end() && iter->value==value) {
122        if (iter->label) {
123          f1 += iter->weight;
124        }
125        else {
126          f2 += iter->weight;
127        }
128        ++count;
129        ++iter;
130      }
131      res.resize(res.size()+count, f1/sum_w1_-f2/sum_w2_);
132    }
133  }
134
135
136  void KolmogorovSmirnov::shuffle(void)
137  {
138    std::deque<bool> labels;
139    for (data_w::const_iterator iter=data_.begin(); 
140         iter!=data_.end(); ++iter) {
141      labels.push_back(iter->label);
142    }
143    random::random_shuffle(labels.begin(), labels.end());
144    std::deque<bool>::const_iterator label = labels.begin();
145    for (data_w::iterator iter=data_.begin(); 
146         iter!=data_.end(); ++iter, ++label) {
147      // label does not affect sorting of set, so modifying label is safe
148#ifdef MULTISET_ITERATOR_IS_MUTABLE
149      (*iter).label = *label;
150#else
151      // standard requires that multiset::iterator is modifiable, but
152      // some implementations do not fulfill the requirement
153      const_cast<Element&>(*iter).label = *label;
154#endif
155    }
156    sum_w1_ = 0;
157    sum_w2_ = 0;
158    add_sum_w(data_.begin(), data_.end());
159    cached_ = false;
160  }
161
162
163  double KolmogorovSmirnov::signed_score(void) const
164  {
165    if (cached_)
166      return score_;
167    if (!sum_w1_ || !sum_w2_)
168      return 0.0;
169    score_=0;
170    std::vector<double> v;
171    scores(v);
172    std::less<double> l;
173    utility::abs<double> a;
174    using utility::make_compose_f_gx_hy;
175    score_ = *std::max_element(v.begin(), v.end(), 
176                               make_compose_f_gx_hy(l, a, a));
177    cached_=true;
178    return score_;
179  }
180
181
182  std::ostream& operator<<(std::ostream& os, const KolmogorovSmirnov& ks)
183  {
184    std::vector<double> s;
185    ks.scores(s);
186    for (size_t i=0; i<s.size(); ++i)
187      os << i << "\t" << s[i] << "\n";
188    return os;
189  }
190
191
192  bool KolmogorovSmirnov::Element::operator<
193  (const KolmogorovSmirnov::Element rhs) const
194  {
195    return value<rhs.value;
196  }
197
198}}} // of namespace theplu yat statistics
Note: See TracBrowser for help on using the repository browser.