source: trunk/yat/statistics/KolmogorovSmirnov.cc @ 1703

Last change on this file since 1703 was 1703, checked in by Peter, 12 years ago

correcting includes

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 4.3 KB
Line 
1// $Id: KolmogorovSmirnov.cc 1703 2009-01-08 16:35:28Z peter $
2
3/*
4  Copyright (C) 2008, 2009 Peter Johansson
5
6  This file is part of the yat library, http://dev.thep.lu.se/yat
7
8  The yat library is free software; you can redistribute it and/or
9  modify it under the terms of the GNU General Public License as
10  published by the Free Software Foundation; either version 3 of the
11  License, or (at your option) any later version.
12
13  The yat library is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16  General Public License for more details.
17
18  You should have received a copy of the GNU General Public License
19  along with yat. If not, see <http://www.gnu.org/licenses/>.
20*/
21
22#ifdef HAVE_CONFIG_H
23#include <config.h>
24#endif
25
26#include "KolmogorovSmirnov.h"
27#include "yat/random/random.h"
28#include "yat/utility/stl_utility.h"
29
30#include <algorithm>
31#include <cassert>
32#include <cmath>
33#include <deque>
34#include <functional>
35#include <limits>
36#include <vector>
37
38namespace theplu {
39namespace yat {
40namespace statistics {
41
42
43  KolmogorovSmirnov::KolmogorovSmirnov(void)
44    : cached_(true), score_(0.0), sum_w1_(0.0), sum_w2_(0.0)
45  {
46  }
47
48
49  void KolmogorovSmirnov::add(double x, bool target, double w)
50  {
51    if (w==0) // ignoring zero weight data
52      return;
53
54    data_.insert(Element(x,target,w));
55    if (target){
56      sum_w1_+=w;
57    }
58    else {
59      sum_w2_+=w;
60    }
61    cached_=false;
62  }
63
64
65  double KolmogorovSmirnov::p_value(void) const
66  {
67    double res=0;
68    double res2=0;
69    double s2 = score() * score() * sum_w1_*sum_w2_/(sum_w1_+sum_w2_);
70    int sign = 1;
71    for (size_t k = 1; k<100; ++k) {
72      res += sign * 2 * exp(-2.0 * k * k * s2);
73      sign *= -1;
74      // ignore remaining terms as they are small
75      if (res==res2)
76        return res;
77      res2 = res;
78    }
79
80    return res;
81  }
82
83
84  double KolmogorovSmirnov::p_value(size_t perm) const
85  {
86    size_t count=0;
87    double score_threshold = score()-10*std::numeric_limits<double>().epsilon();
88    KolmogorovSmirnov ks(*this);
89
90    for (size_t i=0; i<perm; ++i){
91      ks.shuffle();
92      if (ks.score()>=score_threshold)
93        ++count;
94    }
95    return static_cast<double>(count)/perm;
96  }
97
98
99  void KolmogorovSmirnov::reset(void)
100  {
101    cached_=false;
102    data_.clear();
103    sum_w1_=0;
104    sum_w2_=0;
105  }
106
107
108  double KolmogorovSmirnov::score(void) const
109  {
110    return std::abs(signed_score());
111  }
112
113
114  void KolmogorovSmirnov::scores(std::vector<double>& res) const
115  {
116    res.clear();
117    res.reserve(data_.size());
118    data_w::const_iterator iter(data_.begin());
119    double f1=0;
120    double f2=0;
121    while(iter!=data_.end()){
122      size_t count=0;
123      double value = iter->value;
124      while (iter!=data_.end() && iter->value==value) {
125        if (iter->label) {
126          f1 += iter->weight;
127        }
128        else {
129          f2 += iter->weight;
130        }
131        ++count;
132        ++iter;
133      }
134      res.resize(res.size()+count, f1/sum_w1_-f2/sum_w2_);
135    }
136  }
137
138
139  void KolmogorovSmirnov::shuffle(void)
140  {
141    std::deque<bool> labels;
142    for (data_w::const_iterator iter=data_.begin(); 
143         iter!=data_.end(); ++iter) {
144      labels.push_back(iter->label);
145    }
146    random::random_shuffle(labels.begin(), labels.end());
147    std::deque<bool>::const_iterator label = labels.begin();
148    for (data_w::iterator iter=data_.begin(); 
149         iter!=data_.end(); ++iter, ++label) {
150      // label does not affect sorting of set, so modifying label is safe
151#ifdef MULTISET_ITERATOR_IS_MUTABLE
152      (*iter).label = *label;
153#else
154      // standard requires that multiset::iterator is modifiable, but
155      // some implementations do not fulfill the requirement
156      const_cast<Element&>(*iter).label = *label;
157#endif
158    }
159    sum_w1_ = 0;
160    sum_w2_ = 0;
161    add_sum_w(data_.begin(), data_.end());
162    cached_ = false;
163  }
164
165
166  double KolmogorovSmirnov::signed_score(void) const
167  {
168    if (cached_)
169      return score_;
170    if (!sum_w1_ || !sum_w2_)
171      return 0.0;
172    score_=0;
173    std::vector<double> v;
174    scores(v);
175    std::less<double> l;
176    utility::abs<double> a;
177    using utility::make_compose_f_gx_hy;
178    score_ = *std::max_element(v.begin(), v.end(), 
179                               make_compose_f_gx_hy(l, a, a));
180    cached_=true;
181    return score_;
182  }
183
184
185  std::ostream& operator<<(std::ostream& os, const KolmogorovSmirnov& ks)
186  {
187    std::vector<double> s;
188    ks.scores(s);
189    for (size_t i=0; i<s.size(); ++i)
190      os << i << "\t" << s[i] << "\n";
191    return os;
192  }
193
194
195  bool KolmogorovSmirnov::Element::operator<
196  (const KolmogorovSmirnov::Element rhs) const
197  {
198    return value<rhs.value;
199  }
200
201}}} // of namespace theplu yat statistics
Note: See TracBrowser for help on using the repository browser.