source: branches/0.10-stable/yat/statistics/KolmogorovSmirnov.cc @ 2997

Last change on this file since 2997 was 2997, checked in by Peter, 9 years ago

improve error message

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 5.3 KB
Line 
1// $Id: KolmogorovSmirnov.cc 2997 2013-03-14 06:12:20Z peter $
2
3/*
4  Copyright (C) 2008 Jari Häkkinen, Peter Johansson
5  Copyright (C) 2009, 2011, 2012 Peter Johansson
6
7  This file is part of the yat library, http://dev.thep.lu.se/yat
8
9  The yat library is free software; you can redistribute it and/or
10  modify it under the terms of the GNU General Public License as
11  published by the Free Software Foundation; either version 3 of the
12  License, or (at your option) any later version.
13
14  The yat library is distributed in the hope that it will be useful,
15  but WITHOUT ANY WARRANTY; without even the implied warranty of
16  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  General Public License for more details.
18
19  You should have received a copy of the GNU General Public License
20  along with yat. If not, see <http://www.gnu.org/licenses/>.
21*/
22
23#include <config.h>
24
25#include "KolmogorovSmirnov.h"
26#include "yat/random/random.h"
27#include "yat/utility/Exception.h"
28#include "yat/utility/stl_utility.h"
29
30#include <algorithm>
31#include <cassert>
32#include <cmath>
33#include <deque>
34#include <functional>
35#include <limits>
36#include <sstream>
37#include <vector>
38
39namespace theplu {
40namespace yat {
41namespace statistics {
42
43
44  KolmogorovSmirnov::Element::Element(void)
45  {}
46
47
48  KolmogorovSmirnov::Element::Element(double x, bool class_label, double w)
49    : value(x), label(class_label), weight(w)
50  {}
51
52
53  KolmogorovSmirnov::KolmogorovSmirnov(void)
54    : cached_(true), score_(0.0), sum_w1_(0.0), sum_w2_(0.0)
55  {
56  }
57
58
59  void KolmogorovSmirnov::add(double x, bool target, double w)
60  {
61    if (w==0) // ignoring zero weight data
62      return;
63
64    assert(!std::isnan(x));
65    data_.insert(Element(x,target,w));
66    if (target){
67      sum_w1_+=w;
68    }
69    else {
70      sum_w2_+=w;
71    }
72    cached_=false;
73  }
74
75
76  double KolmogorovSmirnov::p_value(void) const
77  {
78    double res=0;
79    double res2=0;
80    double s2 = score() * score() * sum_w1_*sum_w2_/(sum_w1_+sum_w2_);
81    int sign = 1;
82    for (size_t k = 1; k<100; ++k) {
83      res += sign * 2 * exp(-2.0 * k * k * s2);
84      sign *= -1;
85      // ignore remaining terms as they are small
86      if (res==res2)
87        return res;
88      res2 = res;
89    }
90
91    return res;
92  }
93
94
95  double KolmogorovSmirnov::p_value(size_t perm) const
96  {
97    size_t count=0;
98    double score_threshold = score()-10*std::numeric_limits<double>().epsilon();
99    KolmogorovSmirnov ks(*this);
100
101    for (size_t i=0; i<perm; ++i){
102      ks.shuffle();
103      if (ks.score()>=score_threshold)
104        ++count;
105    }
106    return static_cast<double>(count)/perm;
107  }
108
109
110  void KolmogorovSmirnov::remove(double value, bool class_label, double weight)
111  {
112    if (!weight)
113      return;
114    Element e(value, class_label, weight);
115    typedef std::multiset<Element>::const_iterator iterator;
116    std::pair<iterator, iterator> iter = data_.equal_range(e);
117    while (iter.first!=iter.second) {
118      if (iter.first->label==class_label && iter.first->weight==weight) {
119        if (class_label)
120          sum_w1_-=weight;
121        else
122          sum_w2_-=weight;
123        cached_=false;
124        return;
125      }
126      ++iter.first;
127    }
128    std::ostringstream ss;
129    ss << "KolmogorovSmirnov::remove: " << value << " " << class_label
130       << " " << weight;
131    throw utility::runtime_error(ss.str());
132  }
133
134
135  void KolmogorovSmirnov::reset(void)
136  {
137    cached_=false;
138    data_.clear();
139    sum_w1_=0;
140    sum_w2_=0;
141  }
142
143
144  double KolmogorovSmirnov::score(void) const
145  {
146    return std::abs(signed_score());
147  }
148
149
150  void KolmogorovSmirnov::scores(std::vector<double>& res) const
151  {
152    res.clear();
153    res.reserve(data_.size());
154    data_w::const_iterator iter(data_.begin());
155    double f1=0;
156    double f2=0;
157    while(iter!=data_.end()){
158      size_t count=0;
159      double value = iter->value;
160      while (iter!=data_.end() && iter->value==value) {
161        if (iter->label) {
162          f1 += iter->weight;
163        }
164        else {
165          f2 += iter->weight;
166        }
167        ++count;
168        ++iter;
169      }
170      res.resize(res.size()+count, f1/sum_w1_-f2/sum_w2_);
171    }
172  }
173
174
175  void KolmogorovSmirnov::shuffle(void)
176  {
177    std::deque<bool> labels;
178    for (data_w::const_iterator iter=data_.begin();
179         iter!=data_.end(); ++iter) {
180      labels.push_back(iter->label);
181    }
182    random::random_shuffle(labels.begin(), labels.end());
183    std::deque<bool>::const_iterator label = labels.begin();
184    for (data_w::iterator iter=data_.begin();
185         iter!=data_.end(); ++iter, ++label) {
186      // label does not affect sorting of set, so modifying label is safe
187#ifdef MULTISET_ITERATOR_IS_MUTABLE
188      (*iter).label = *label;
189#else
190      // standard requires that multiset::iterator is modifiable, but
191      // some implementations do not fulfill the requirement
192      const_cast<Element&>(*iter).label = *label;
193#endif
194    }
195    sum_w1_ = 0;
196    sum_w2_ = 0;
197    add_sum_w(data_.begin(), data_.end());
198    cached_ = false;
199  }
200
201
202  double KolmogorovSmirnov::signed_score(void) const
203  {
204    if (cached_)
205      return score_;
206    if (!sum_w1_ || !sum_w2_)
207      return 0.0;
208    score_=0;
209    std::vector<double> v;
210    scores(v);
211    std::less<double> l;
212    utility::abs<double> a;
213    using utility::make_compose_f_gx_hy;
214    score_ = *std::max_element(v.begin(), v.end(),
215                               make_compose_f_gx_hy(l, a, a));
216    cached_=true;
217    return score_;
218  }
219
220
221  std::ostream& operator<<(std::ostream& os, const KolmogorovSmirnov& ks)
222  {
223    std::vector<double> s;
224    ks.scores(s);
225    for (size_t i=0; i<s.size(); ++i)
226      os << i << "\t" << s[i] << "\n";
227    return os;
228  }
229
230
231  bool KolmogorovSmirnov::Element::operator<
232  (const KolmogorovSmirnov::Element& rhs) const
233  {
234    return value<rhs.value;
235  }
236
237}}} // of namespace theplu yat statistics
Note: See TracBrowser for help on using the repository browser.