source: trunk/yat/statistics/KolmogorovSmirnov.cc @ 2837

Last change on this file since 2837 was 2837, checked in by Peter, 11 years ago

avoid ancient inclusion guards around '#include <config.h>'

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 5.1 KB
Line 
1// $Id: KolmogorovSmirnov.cc 2837 2012-09-16 23:07:47Z peter $
2
3/*
4  Copyright (C) 2008 Jari Häkkinen, Peter Johansson
5  Copyright (C) 2009, 2011, 2012 Peter Johansson
6
7  This file is part of the yat library, http://dev.thep.lu.se/yat
8
9  The yat library is free software; you can redistribute it and/or
10  modify it under the terms of the GNU General Public License as
11  published by the Free Software Foundation; either version 3 of the
12  License, or (at your option) any later version.
13
14  The yat library is distributed in the hope that it will be useful,
15  but WITHOUT ANY WARRANTY; without even the implied warranty of
16  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  General Public License for more details.
18
19  You should have received a copy of the GNU General Public License
20  along with yat. If not, see <http://www.gnu.org/licenses/>.
21*/
22
23#include <config.h>
24
25#include "KolmogorovSmirnov.h"
26#include "yat/random/random.h"
27#include "yat/utility/Exception.h"
28#include "yat/utility/stl_utility.h"
29
30#include <algorithm>
31#include <cassert>
32#include <cmath>
33#include <deque>
34#include <functional>
35#include <limits>
36#include <vector>
37
38namespace theplu {
39namespace yat {
40namespace statistics {
41
42
43  KolmogorovSmirnov::Element::Element(void)
44  {}
45
46
47  KolmogorovSmirnov::Element::Element(double x, bool class_label, double w)
48    : value(x), label(class_label), weight(w)
49  {}
50
51
52  KolmogorovSmirnov::KolmogorovSmirnov(void)
53    : cached_(true), score_(0.0), sum_w1_(0.0), sum_w2_(0.0)
54  {
55  }
56
57
58  void KolmogorovSmirnov::add(double x, bool target, double w)
59  {
60    if (w==0) // ignoring zero weight data
61      return;
62
63    assert(!std::isnan(x));
64    data_.insert(Element(x,target,w));
65    if (target){
66      sum_w1_+=w;
67    }
68    else {
69      sum_w2_+=w;
70    }
71    cached_=false;
72  }
73
74
75  double KolmogorovSmirnov::p_value(void) const
76  {
77    double res=0;
78    double res2=0;
79    double s2 = score() * score() * sum_w1_*sum_w2_/(sum_w1_+sum_w2_);
80    int sign = 1;
81    for (size_t k = 1; k<100; ++k) {
82      res += sign * 2 * exp(-2.0 * k * k * s2);
83      sign *= -1;
84      // ignore remaining terms as they are small
85      if (res==res2)
86        return res;
87      res2 = res;
88    }
89
90    return res;
91  }
92
93
94  double KolmogorovSmirnov::p_value(size_t perm) const
95  {
96    size_t count=0;
97    double score_threshold = score()-10*std::numeric_limits<double>().epsilon();
98    KolmogorovSmirnov ks(*this);
99
100    for (size_t i=0; i<perm; ++i){
101      ks.shuffle();
102      if (ks.score()>=score_threshold)
103        ++count;
104    }
105    return static_cast<double>(count)/perm;
106  }
107
108
109  void KolmogorovSmirnov::remove(double value, bool class_label, double weight)
110  {
111    if (!weight)
112      return;
113    Element e(value, class_label, weight);
114    typedef std::multiset<Element>::const_iterator iterator;
115    std::pair<iterator, iterator> iter = data_.equal_range(e);
116    while (iter.first!=iter.second) {
117      if (iter.first->label==class_label && iter.first->weight==weight) {
118        if (class_label)
119          sum_w1_-=weight;
120        else
121          sum_w2_-=weight;
122        cached_=false;
123        return;
124      }
125      ++iter.first;
126    }
127    throw utility::runtime_error("bajs");
128  }
129
130
131  void KolmogorovSmirnov::reset(void)
132  {
133    cached_=false;
134    data_.clear();
135    sum_w1_=0;
136    sum_w2_=0;
137  }
138
139
140  double KolmogorovSmirnov::score(void) const
141  {
142    return std::abs(signed_score());
143  }
144
145
146  void KolmogorovSmirnov::scores(std::vector<double>& res) const
147  {
148    res.clear();
149    res.reserve(data_.size());
150    data_w::const_iterator iter(data_.begin());
151    double f1=0;
152    double f2=0;
153    while(iter!=data_.end()){
154      size_t count=0;
155      double value = iter->value;
156      while (iter!=data_.end() && iter->value==value) {
157        if (iter->label) {
158          f1 += iter->weight;
159        }
160        else {
161          f2 += iter->weight;
162        }
163        ++count;
164        ++iter;
165      }
166      res.resize(res.size()+count, f1/sum_w1_-f2/sum_w2_);
167    }
168  }
169
170
171  void KolmogorovSmirnov::shuffle(void)
172  {
173    std::deque<bool> labels;
174    for (data_w::const_iterator iter=data_.begin();
175         iter!=data_.end(); ++iter) {
176      labels.push_back(iter->label);
177    }
178    random::random_shuffle(labels.begin(), labels.end());
179    std::deque<bool>::const_iterator label = labels.begin();
180    for (data_w::iterator iter=data_.begin();
181         iter!=data_.end(); ++iter, ++label) {
182      // label does not affect sorting of set, so modifying label is safe
183#ifdef MULTISET_ITERATOR_IS_MUTABLE
184      (*iter).label = *label;
185#else
186      // standard requires that multiset::iterator is modifiable, but
187      // some implementations do not fulfill the requirement
188      const_cast<Element&>(*iter).label = *label;
189#endif
190    }
191    sum_w1_ = 0;
192    sum_w2_ = 0;
193    add_sum_w(data_.begin(), data_.end());
194    cached_ = false;
195  }
196
197
198  double KolmogorovSmirnov::signed_score(void) const
199  {
200    if (cached_)
201      return score_;
202    if (!sum_w1_ || !sum_w2_)
203      return 0.0;
204    score_=0;
205    std::vector<double> v;
206    scores(v);
207    std::less<double> l;
208    utility::abs<double> a;
209    using utility::make_compose_f_gx_hy;
210    score_ = *std::max_element(v.begin(), v.end(),
211                               make_compose_f_gx_hy(l, a, a));
212    cached_=true;
213    return score_;
214  }
215
216
217  std::ostream& operator<<(std::ostream& os, const KolmogorovSmirnov& ks)
218  {
219    std::vector<double> s;
220    ks.scores(s);
221    for (size_t i=0; i<s.size(); ++i)
222      os << i << "\t" << s[i] << "\n";
223    return os;
224  }
225
226
227  bool KolmogorovSmirnov::Element::operator<
228  (const KolmogorovSmirnov::Element& rhs) const
229  {
230    return value<rhs.value;
231  }
232
233}}} // of namespace theplu yat statistics
Note: See TracBrowser for help on using the repository browser.