source: trunk/yat/statistics/KolmogorovSmirnov.cc @ 2014

Last change on this file since 2014 was 2014, checked in by Peter, 12 years ago

adding assert that NaN with finite weight are not allowd in KS statistics

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 4.4 KB
Line 
1// $Id: KolmogorovSmirnov.cc 2014 2009-07-31 14:31:24Z peter $
2
3/*
4  Copyright (C) 2008 Jari Häkkinen, Peter Johansson
5  Copyright (C) 2009 Peter Johansson
6
7  This file is part of the yat library, http://dev.thep.lu.se/yat
8
9  The yat library is free software; you can redistribute it and/or
10  modify it under the terms of the GNU General Public License as
11  published by the Free Software Foundation; either version 3 of the
12  License, or (at your option) any later version.
13
14  The yat library is distributed in the hope that it will be useful,
15  but WITHOUT ANY WARRANTY; without even the implied warranty of
16  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  General Public License for more details.
18
19  You should have received a copy of the GNU General Public License
20  along with yat. If not, see <http://www.gnu.org/licenses/>.
21*/
22
23#ifdef HAVE_CONFIG_H
24#include <config.h>
25#endif
26
27#include "KolmogorovSmirnov.h"
28#include "yat/random/random.h"
29#include "yat/utility/stl_utility.h"
30
31#include <algorithm>
32#include <cassert>
33#include <cmath>
34#include <deque>
35#include <functional>
36#include <limits>
37#include <vector>
38
39namespace theplu {
40namespace yat {
41namespace statistics {
42
43
44  KolmogorovSmirnov::KolmogorovSmirnov(void)
45    : cached_(true), score_(0.0), sum_w1_(0.0), sum_w2_(0.0)
46  {
47  }
48
49
50  void KolmogorovSmirnov::add(double x, bool target, double w)
51  {
52    if (w==0) // ignoring zero weight data
53      return;
54
55    assert(!std::isnan(x));
56    data_.insert(Element(x,target,w));
57    if (target){
58      sum_w1_+=w;
59    }
60    else {
61      sum_w2_+=w;
62    }
63    cached_=false;
64  }
65
66
67  double KolmogorovSmirnov::p_value(void) const
68  {
69    double res=0;
70    double res2=0;
71    double s2 = score() * score() * sum_w1_*sum_w2_/(sum_w1_+sum_w2_);
72    int sign = 1;
73    for (size_t k = 1; k<100; ++k) {
74      res += sign * 2 * exp(-2.0 * k * k * s2);
75      sign *= -1;
76      // ignore remaining terms as they are small
77      if (res==res2)
78        return res;
79      res2 = res;
80    }
81
82    return res;
83  }
84
85
86  double KolmogorovSmirnov::p_value(size_t perm) const
87  {
88    size_t count=0;
89    double score_threshold = score()-10*std::numeric_limits<double>().epsilon();
90    KolmogorovSmirnov ks(*this);
91
92    for (size_t i=0; i<perm; ++i){
93      ks.shuffle();
94      if (ks.score()>=score_threshold)
95        ++count;
96    }
97    return static_cast<double>(count)/perm;
98  }
99
100
101  void KolmogorovSmirnov::reset(void)
102  {
103    cached_=false;
104    data_.clear();
105    sum_w1_=0;
106    sum_w2_=0;
107  }
108
109
110  double KolmogorovSmirnov::score(void) const
111  {
112    return std::abs(signed_score());
113  }
114
115
116  void KolmogorovSmirnov::scores(std::vector<double>& res) const
117  {
118    res.clear();
119    res.reserve(data_.size());
120    data_w::const_iterator iter(data_.begin());
121    double f1=0;
122    double f2=0;
123    while(iter!=data_.end()){
124      size_t count=0;
125      double value = iter->value;
126      while (iter!=data_.end() && iter->value==value) {
127        if (iter->label) {
128          f1 += iter->weight;
129        }
130        else {
131          f2 += iter->weight;
132        }
133        ++count;
134        ++iter;
135      }
136      res.resize(res.size()+count, f1/sum_w1_-f2/sum_w2_);
137    }
138  }
139
140
141  void KolmogorovSmirnov::shuffle(void)
142  {
143    std::deque<bool> labels;
144    for (data_w::const_iterator iter=data_.begin(); 
145         iter!=data_.end(); ++iter) {
146      labels.push_back(iter->label);
147    }
148    random::random_shuffle(labels.begin(), labels.end());
149    std::deque<bool>::const_iterator label = labels.begin();
150    for (data_w::iterator iter=data_.begin(); 
151         iter!=data_.end(); ++iter, ++label) {
152      // label does not affect sorting of set, so modifying label is safe
153#ifdef MULTISET_ITERATOR_IS_MUTABLE
154      (*iter).label = *label;
155#else
156      // standard requires that multiset::iterator is modifiable, but
157      // some implementations do not fulfill the requirement
158      const_cast<Element&>(*iter).label = *label;
159#endif
160    }
161    sum_w1_ = 0;
162    sum_w2_ = 0;
163    add_sum_w(data_.begin(), data_.end());
164    cached_ = false;
165  }
166
167
168  double KolmogorovSmirnov::signed_score(void) const
169  {
170    if (cached_)
171      return score_;
172    if (!sum_w1_ || !sum_w2_)
173      return 0.0;
174    score_=0;
175    std::vector<double> v;
176    scores(v);
177    std::less<double> l;
178    utility::abs<double> a;
179    using utility::make_compose_f_gx_hy;
180    score_ = *std::max_element(v.begin(), v.end(), 
181                               make_compose_f_gx_hy(l, a, a));
182    cached_=true;
183    return score_;
184  }
185
186
187  std::ostream& operator<<(std::ostream& os, const KolmogorovSmirnov& ks)
188  {
189    std::vector<double> s;
190    ks.scores(s);
191    for (size_t i=0; i<s.size(); ++i)
192      os << i << "\t" << s[i] << "\n";
193    return os;
194  }
195
196
197  bool KolmogorovSmirnov::Element::operator<
198  (const KolmogorovSmirnov::Element rhs) const
199  {
200    return value<rhs.value;
201  }
202
203}}} // of namespace theplu yat statistics
Note: See TracBrowser for help on using the repository browser.