source: branches/0.10-stable/yat/statistics/KolmogorovSmirnov.cc @ 3014

Last change on this file since 3014 was 3014, checked in by Peter, 8 years ago

update copyright years

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 5.3 KB
Line 
1// $Id: KolmogorovSmirnov.cc 3014 2013-04-04 02:43:32Z peter $
2
3/*
4  Copyright (C) 2008 Jari Häkkinen, Peter Johansson
5  Copyright (C) 2009, 2011, 2012, 2013 Peter Johansson
6
7  This file is part of the yat library, http://dev.thep.lu.se/yat
8
9  The yat library is free software; you can redistribute it and/or
10  modify it under the terms of the GNU General Public License as
11  published by the Free Software Foundation; either version 3 of the
12  License, or (at your option) any later version.
13
14  The yat library is distributed in the hope that it will be useful,
15  but WITHOUT ANY WARRANTY; without even the implied warranty of
16  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  General Public License for more details.
18
19  You should have received a copy of the GNU General Public License
20  along with yat. If not, see <http://www.gnu.org/licenses/>.
21*/
22
23#include <config.h>
24
25#include "KolmogorovSmirnov.h"
26#include "yat/random/random.h"
27#include "yat/utility/Exception.h"
28#include "yat/utility/stl_utility.h"
29
30#include <algorithm>
31#include <cassert>
32#include <cmath>
33#include <deque>
34#include <functional>
35#include <limits>
36#include <sstream>
37#include <vector>
38
39namespace theplu {
40namespace yat {
41namespace statistics {
42
43
44  KolmogorovSmirnov::Element::Element(void)
45  {}
46
47
48  KolmogorovSmirnov::Element::Element(double x, bool class_label, double w)
49    : value(x), label(class_label), weight(w)
50  {}
51
52
53  KolmogorovSmirnov::KolmogorovSmirnov(void)
54    : cached_(true), score_(0.0), sum_w1_(0.0), sum_w2_(0.0)
55  {
56  }
57
58
59  void KolmogorovSmirnov::add(double x, bool target, double w)
60  {
61    if (w==0) // ignoring zero weight data
62      return;
63
64    assert(!std::isnan(x));
65    data_.insert(Element(x,target,w));
66    if (target){
67      sum_w1_+=w;
68    }
69    else {
70      sum_w2_+=w;
71    }
72    cached_=false;
73  }
74
75
76  double KolmogorovSmirnov::p_value(void) const
77  {
78    double res=0;
79    double res2=0;
80    double s2 = score() * score() * sum_w1_*sum_w2_/(sum_w1_+sum_w2_);
81    int sign = 1;
82    for (size_t k = 1; k<100; ++k) {
83      res += sign * 2 * exp(-2.0 * k * k * s2);
84      sign *= -1;
85      // ignore remaining terms as they are small
86      if (res==res2)
87        return res;
88      res2 = res;
89    }
90
91    return res;
92  }
93
94
95  double KolmogorovSmirnov::p_value(size_t perm) const
96  {
97    size_t count=0;
98    double score_threshold = score()-10*std::numeric_limits<double>().epsilon();
99    KolmogorovSmirnov ks(*this);
100
101    for (size_t i=0; i<perm; ++i){
102      ks.shuffle();
103      if (ks.score()>=score_threshold)
104        ++count;
105    }
106    return static_cast<double>(count)/perm;
107  }
108
109
110  void KolmogorovSmirnov::remove(double value, bool class_label, double weight)
111  {
112    if (!weight)
113      return;
114    Element e(value, class_label, weight);
115    typedef std::multiset<Element>::iterator iterator;
116    std::pair<iterator, iterator> iter = data_.equal_range(e);
117    while (iter.first!=iter.second) {
118      if (iter.first->label==class_label && iter.first->weight==weight) {
119        if (class_label)
120          sum_w1_-=weight;
121        else
122          sum_w2_-=weight;
123        data_.erase(iter.first);
124        cached_=false;
125        return;
126      }
127      ++iter.first;
128    }
129    std::ostringstream ss;
130    ss << "KolmogorovSmirnov::remove: " << value << " " << class_label
131       << " " << weight;
132    throw utility::runtime_error(ss.str());
133  }
134
135
136  void KolmogorovSmirnov::reset(void)
137  {
138    cached_=false;
139    data_.clear();
140    sum_w1_=0;
141    sum_w2_=0;
142  }
143
144
145  double KolmogorovSmirnov::score(void) const
146  {
147    return std::abs(signed_score());
148  }
149
150
151  void KolmogorovSmirnov::scores(std::vector<double>& res) const
152  {
153    res.clear();
154    res.reserve(data_.size());
155    data_w::const_iterator iter(data_.begin());
156    double f1=0;
157    double f2=0;
158    while(iter!=data_.end()){
159      size_t count=0;
160      double value = iter->value;
161      while (iter!=data_.end() && iter->value==value) {
162        if (iter->label) {
163          f1 += iter->weight;
164        }
165        else {
166          f2 += iter->weight;
167        }
168        ++count;
169        ++iter;
170      }
171      res.resize(res.size()+count, f1/sum_w1_-f2/sum_w2_);
172    }
173  }
174
175
176  void KolmogorovSmirnov::shuffle(void)
177  {
178    std::deque<bool> labels;
179    for (data_w::const_iterator iter=data_.begin();
180         iter!=data_.end(); ++iter) {
181      labels.push_back(iter->label);
182    }
183    random::random_shuffle(labels.begin(), labels.end());
184    std::deque<bool>::const_iterator label = labels.begin();
185    for (data_w::iterator iter=data_.begin();
186         iter!=data_.end(); ++iter, ++label) {
187      // label does not affect sorting of set, so modifying label is safe
188#ifdef MULTISET_ITERATOR_IS_MUTABLE
189      (*iter).label = *label;
190#else
191      // standard requires that multiset::iterator is modifiable, but
192      // some implementations do not fulfill the requirement
193      const_cast<Element&>(*iter).label = *label;
194#endif
195    }
196    sum_w1_ = 0;
197    sum_w2_ = 0;
198    add_sum_w(data_.begin(), data_.end());
199    cached_ = false;
200  }
201
202
203  double KolmogorovSmirnov::signed_score(void) const
204  {
205    if (cached_)
206      return score_;
207    if (!sum_w1_ || !sum_w2_)
208      return 0.0;
209    score_=0;
210    std::vector<double> v;
211    scores(v);
212    std::less<double> l;
213    utility::abs<double> a;
214    using utility::make_compose_f_gx_hy;
215    score_ = *std::max_element(v.begin(), v.end(),
216                               make_compose_f_gx_hy(l, a, a));
217    cached_=true;
218    return score_;
219  }
220
221
222  std::ostream& operator<<(std::ostream& os, const KolmogorovSmirnov& ks)
223  {
224    std::vector<double> s;
225    ks.scores(s);
226    for (size_t i=0; i<s.size(); ++i)
227      os << i << "\t" << s[i] << "\n";
228    return os;
229  }
230
231
232  bool KolmogorovSmirnov::Element::operator<
233  (const KolmogorovSmirnov::Element& rhs) const
234  {
235    return value<rhs.value;
236  }
237
238}}} // of namespace theplu yat statistics
Note: See TracBrowser for help on using the repository browser.