source: branches/0.13-stable/yat/statistics/ROC.h @ 3434

Last change on this file since 3434 was 3434, checked in by Peter, 7 years ago

closes #847

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 11.5 KB
Line 
1#ifndef _theplu_yat_statistics_roc_
2#define _theplu_yat_statistics_roc_
3
4// $Id: ROC.h 3434 2015-10-28 01:31:58Z peter $
5
6/*
7  Copyright (C) 2004 Peter Johansson
8  Copyright (C) 2005, 2006, 2007, 2008 Jari Häkkinen, Peter Johansson
9  Copyright (C) 2011, 2012, 2013 Peter Johansson
10
11  This file is part of the yat library, http://dev.thep.lu.se/yat
12
13  The yat library is free software; you can redistribute it and/or
14  modify it under the terms of the GNU General Public License as
15  published by the Free Software Foundation; either version 3 of the
16  License, or (at your option) any later version.
17
18  The yat library is distributed in the hope that it will be useful,
19  but WITHOUT ANY WARRANTY; without even the implied warranty of
20  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21  General Public License for more details.
22
23  You should have received a copy of the GNU General Public License
24  along with yat. If not, see <http://www.gnu.org/licenses/>.
25*/
26
27#include "Averager.h"
28#include "yat/utility/deprecate.h"
29#include "yat/utility/stl_utility.h"
30#include "yat/utility/yat_assert.h"
31
32#include <gsl/gsl_randist.h>
33
34#include <algorithm>
35#include <map>
36#include <utility>
37#include <vector>
38
39namespace theplu {
40namespace yat {
41namespace statistics {
42
43  ///
44  /// @brief Reciever Operating Characteristic.
45  ///
46  /// As the area under an ROC curve is equivalent to Mann-Whitney U
47  /// statistica, this class can be used to perform a Mann-Whitney
48  /// U-test (aka Wilcoxon).
49  ///
50  /// \see AUC
51  ///
52  class ROC
53  {
54
55  public:
56    ///
57    /// @brief Default constructor
58    ///
59    ROC(void);
60
61    /**
62       \brief Add a data value.
63
64       \param value data value
65       \param target \c true if value belongs to class positive
66
67       \param weight indicating how important the data point is. A
68       zero weight implies the data point is ignored. A negative
69       weight should be understood as removing a data point and thus
70       typically only makes sense if there is a previously added data
71       point with same \a value and \a target.
72
73    */
74    void add(double value, bool target, double weight=1.0);
75
76    /**
77       \brief Area Under Curve, AUC
78
79       \see AUC for how the area is calculated
80
81       @return Area under curve.
82    */
83    double area(void) const;
84
85    /**
86       \brief threshold for p_value calculation
87
88       Function can used to change the minimum_size.
89
90       \return reference to threshold minimum size
91     */
92    unsigned int& minimum_size(void);
93
94    /**
95       \brief threshold for p_value calculation
96
97       Threshold deciding whether p-value is computed using exact
98       method or a Gaussian approximation. If either number of positive
99       samples, n_pos(void), or number of negative samples,
100       n_neg(void), are smaller than minimum_size the exact method is
101       used.
102
103       \see p_value
104
105       \return const reference to minimum_size
106    */
107    const unsigned int& minimum_size(void) const;
108
109    ///
110    /// \brief number of samples
111    ///
112    /// @return sum of weights
113    ///
114    double n(void) const;
115
116    ///
117    /// \brief number of negative samples
118    ///
119    /// @return sum of weights with negative target
120    ///
121    double n_neg(void) const;
122
123    ///
124    /// \brief number of positive samples
125    ///
126    /// @return sum of weights with positive target
127    ///
128    double n_pos(void) const;
129
130
131    /**
132       Calculates the probability to get this area (or less).
133
134       \see p_right for more details
135     */
136    double p_left(void) const;
137
138    /**
139       \brief One-sided P-value
140
141       Calculates the one-sided p-value, i.e., probability to get this
142       area (or greater) given that there is no difference
143       between the two classes.
144
145       \b Exact \b method: In the exact method the function goes
146       through all permutations and counts what fraction for which the
147       area is greater (or equal) than area in original
148       permutation. In case all non-zero weights are not equal,
149       iterating through all permutations is not sufficient so
150       algorithm goes through all combinations instead which quickly
151       becomes a large number (N!).
152
153       \b Large-sample \b Approximation: When many data points are
154       available, see minimum_size(), a Gaussian approximation is used
155       and the p-value is calculated as
156       \f[
157       P = \frac{1}{\sqrt{2\pi}} \int_{-\infty}^z
158       \exp{\left(-\frac{t^2}{2}\right)} dt
159       \f]
160
161       where
162
163       \f[
164       z = \frac{\textrm{area} - 0.5 - 0.5/(n^+ \cdot n^-)}{s}
165       \f]
166
167       and
168
169       \f[
170       s^2 = \frac{n+1+\sum \left(n_x \cdot (n_x^2-1)\right)}
171       {12\cdot n^+\cdot n^-}
172       \f]
173
174       where sum runs over different data values (of ties) and \f$ n_x
175       \f$ is number data points with that value. The sum is a
176       correction term for ties and is zero if there are no ties.
177
178       The number of samples in a group, \f$ n^+ \f$, is calculated as
179       \f$ n = (\sum w)^2 / \sum w^2 \f$
180
181       \return \f$ P(a \ge \textrm{area}) \f$
182     */
183    double p_right(void) const;
184
185    /**
186       \deprecated Provided for backward compatibility with 0.10
187       API. Use p_right() instead.
188     */
189    double p_value_one_sided(void) const YAT_DEPRECATE;
190
191    /**
192       \brief Two-sided p-value.
193
194       Calculates the probability to get an area, \c a, equal or more
195       extreme than \c area
196       \f[
197       P(a \ge \textrm{max}(\textrm{area},1-\textrm{area})) +
198       P(a \le \textrm{min}(\textrm{area}, 1-\textrm{area})) \f]
199
200       If there are no ties, distribution of \a a is symmetric, so if
201       area is greater than 0.5, this boils down to \f$ P = 2*P(a \ge
202       \textrm{area}) = 2*P_\textrm{one-sided}\f$.
203
204       \return two-sided p-value
205
206       \see p_right
207    */
208    double p_value(void) const;
209
210    /**
211       \brief remove a data value
212
213       A data point with identical \a value, \a target, and \a weight
214       must have beed added prior calling this function; else an
215       exception is thrown.
216
217       \since New in yat 0.9
218     */
219    void remove(double value, bool target, double weight=1.0);
220
221    /**
222       @brief Set everything to zero
223    */
224    void reset(void);
225
226  private:
227    typedef std::multimap<double, std::pair<bool, double> > Map;
228    typedef std::vector<std::pair<bool, Map::mapped_type> > Vector;
229
230    // struct used in count functions
231    struct Weights
232    {
233      Weights(void);
234      double small_pos;
235      double small_neg;
236      double tied_pos;
237      double tied_neg;
238    };
239
240    /// Implemented as in MatLab 13.1
241    double get_p_approx(double) const;
242
243    /**
244       return false if all non-zero weights are equal
245     */
246    bool is_weighted(void) const;
247
248    /**
249       return (sum x)^2 / sum x^2
250     */
251    size_t nof_points(const Averager& a) const;
252
253    /*
254      Calculate probability to get an area equal (smaller) than \a
255      area given the distribution of weights and ties in multimap_
256     */
257    double p_left_weighted(double area) const;
258
259    /*
260      Calculate probability to get an area equal (greater) than \a
261      area given the distribution of weights and ties in multimap_
262     */
263    double p_right_weighted(double area) const;
264
265    /*
266      Count number of combinations (of N!) that gives weight sum equal
267      or larger than \a threshold.
268
269      Range [first, last) is used to check for ties. If, e.g., *first
270      and *(first+1) are equal implies that the two largest values are
271      equal.
272     */
273    template <typename Iterator>
274    double count(Iterator first, Iterator last, double threshold) const;
275
276    /*
277      Loop over all elements in \a weights and call count(7)
278     */
279    template <typename Iterator>
280    double count(Vector& weights, Iterator iter, Iterator last,
281                 double threshold, double sum, const Weights& weight) const;
282
283    /*
284      Count number of combinations in which sum>=threshold given
285      classes and weights in \a weight. Range [iter, last) is used to
286      handle ties.
287     */
288    template <typename Iterator>
289    double count(Vector& weights, Iterator iter, Iterator last,
290                 double threshold, double sum, Weights weight,
291                 const std::pair<bool, double>& entry) const;
292
293    /*
294      Calculates probability to get \a block number of pairs correctly
295      sorted when having \a pos positive samples and \a neg negative
296      samples given the distribution of ties as in [first, last).
297     */
298    template<typename ForwardIterator>
299    double p_exact_with_ties(ForwardIterator first, ForwardIterator last,
300                             double block, unsigned int pos,
301                             unsigned int neg) const;
302
303    /**
304       \return P(auc >= area)
305     */
306    double p_exact_right(double area) const;
307
308    /**
309       \return P(auc <= area)
310     */
311    double p_exact_left(double area) const;
312
313    bool use_exact_method(void) const;
314
315    mutable double area_;
316    bool has_ties_;
317    unsigned int minimum_size_;
318    Averager neg_weights_;
319    Averager pos_weights_;
320    Map multimap_;
321  };
322
323  template<typename ForwardIterator>
324  double
325  ROC::p_exact_with_ties(ForwardIterator begin, ForwardIterator end,
326                         double block, unsigned int pos,unsigned int neg) const
327  {
328    if (block <= 0)
329      return 1.0;
330    if (block > pos*neg)
331      return 0.0;
332
333    ForwardIterator iter(begin);
334    unsigned int n=0;
335    while (iter!=end && iter->first == begin->first) {
336      ++iter;
337      ++n;
338    }
339    double result = 0;
340    /*
341      pos1  neg1  |  n
342      pos2  neg2  |
343      ----  ----   ----
344      pos   neg
345     */
346
347    // ensure pos1 and neg2 are non-negative
348    unsigned int pos1 = n - std::min(n, neg);
349    // ensure pos2 and neg1 are non-negative
350    unsigned int max = std::min(n, pos);
351    YAT_ASSERT(pos1<=max);
352    for ( ; pos1<=max; ++pos1) {
353      unsigned int neg1 = n-pos1;
354      YAT_ASSERT(neg1<=n);
355      unsigned int pos2 = pos-pos1;
356      YAT_ASSERT(pos2<=pos);
357      unsigned int neg2 = neg-neg1;
358      YAT_ASSERT(neg2<=neg);
359      result += gsl_ran_hypergeometric_pdf(pos1, static_cast<unsigned int>(pos),
360                                           static_cast<unsigned int>(neg), n)
361        * p_exact_with_ties(iter, end,
362                            block - pos2*neg1 - 0.5*pos1*neg1,
363                            pos2, neg2);
364    }
365    return result;
366  }
367
368
369  template <typename Iterator>
370  double ROC::count(Iterator first, Iterator last, double threshold) const
371  {
372    Vector vec;
373    vec.reserve(multimap_.size());
374    // copy values from multimap_ to v
375    for (Map::const_iterator i = multimap_.begin(); i!=multimap_.end(); ++i)
376      vec.push_back(std::make_pair(false, i->second));
377
378    ROC::Weights w;
379    w.small_pos = pos_weights_.sum_x();
380    w.small_neg = neg_weights_.sum_x();
381    return count(vec, first, last, threshold*w.small_pos*w.small_neg, 0, w);
382  }
383
384
385
386  template <typename Iterator>
387  double ROC::count(ROC::Vector& v, Iterator iter, Iterator last,
388                    double threshold, double sum, const Weights& w) const
389  {
390    double result = 0.0;
391    // loop over all elements
392    int nof_elements = 0;
393    for (ROC::Vector::iterator i=v.begin(); i!=v.end(); ++i) {
394      if (i->first)
395        continue;
396      i->first = true;
397      result += count(v, iter, last, threshold, sum, w, i->second);
398      i->first = false;
399      ++nof_elements;
400    }
401    YAT_ASSERT(nof_elements);
402    return result/nof_elements;
403  }
404
405
406  template <typename Iterator>
407  double ROC::count(Vector& weights, Iterator iter, Iterator last,
408                    double threshold, double sum, Weights w,
409                    const std::pair<bool, double>& entry) const
410  {
411    double tiny = 10e-10;
412
413    Iterator next(iter);
414    YAT_ASSERT(next!=last);
415    ++next;
416
417    // update weights
418    if (entry.first) {
419      w.tied_pos += entry.second;
420      w.small_pos -= entry.second;
421    }
422    else {
423      w.tied_neg += entry.second;
424      w.small_neg -= entry.second;
425    }
426
427    // last entry in equal range
428    if (next==last || *next!=*iter) {
429      sum += 0.5*w.tied_pos*w.tied_neg + w.tied_pos * w.small_neg;
430      w.tied_pos=0;
431      w.tied_neg=0;
432    }
433
434    // max sum happens if all pos values belong to current equal range
435    // and none of the remaining neg values
436    double max_sum = sum + 0.5*(w.tied_pos+w.small_pos)*w.tied_neg +
437      (w.tied_pos+w.small_pos)*w.small_neg;
438
439    if (max_sum<threshold-tiny)
440      return 0.0;
441    if (sum + 0.5*w.tied_pos*(w.tied_neg+w.small_neg) >= threshold-tiny)
442      return 1.0;
443
444    if (next!=last)
445      return count(weights, next, last, threshold, sum, w);
446    return 0.0;
447  }
448
449}}} // of namespace statistics, yat, and theplu
450#endif
Note: See TracBrowser for help on using the repository browser.