source: trunk/yat/statistics/ROC.h @ 2594

Last change on this file since 2594 was 2594, checked in by Peter, 11 years ago

improve docs for ROC and sister class AUC. closes #144

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 6.3 KB
Line 
1#ifndef _theplu_yat_statistics_roc_
2#define _theplu_yat_statistics_roc_
3
4// $Id: ROC.h 2594 2011-10-30 02:36:17Z peter $
5
6/*
7  Copyright (C) 2004 Peter Johansson
8  Copyright (C) 2005, 2006, 2007, 2008 Jari Häkkinen, Peter Johansson
9  Copyright (C) 2011 Peter Johansson
10
11  This file is part of the yat library, http://dev.thep.lu.se/yat
12
13  The yat library is free software; you can redistribute it and/or
14  modify it under the terms of the GNU General Public License as
15  published by the Free Software Foundation; either version 3 of the
16  License, or (at your option) any later version.
17
18  The yat library is distributed in the hope that it will be useful,
19  but WITHOUT ANY WARRANTY; without even the implied warranty of
20  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21  General Public License for more details.
22
23  You should have received a copy of the GNU General Public License
24  along with yat. If not, see <http://www.gnu.org/licenses/>.
25*/
26
27#include <gsl/gsl_randist.h>
28
29#include <map>
30#include <utility>
31
32namespace theplu {
33namespace yat {
34namespace statistics { 
35
36  ///
37  /// @brief Reciever Operating Characteristic.
38  ///
39  /// As the area under an ROC curve is equivalent to Mann-Whitney U
40  /// statistica, this class can be used to perform a Mann-Whitney
41  /// U-test (aka Wilcoxon).
42  ///
43  /// \see AUC
44  ///
45  class ROC
46  {
47 
48  public:
49    ///
50    /// @brief Default constructor
51    ///
52    ROC(void);
53         
54    /**
55       \brief Add a data value.
56
57       \param value data value
58       \param target \c true if value belongs to class positive
59
60       \param weight indicating how important the data point is. A
61       zero weight implies the data point is ignored. A negative
62       weight should be understood as removing a data point and thus
63       typically only makes sense if there is a previously added data
64       point with same \a value and \a target.
65
66    */
67    void add(double value, bool target, double weight=1.0);
68
69    /**
70       \brief Area Under Curve, AUC
71
72       \see AUC for how the area is calculated
73
74       @return Area under curve.
75    */
76    double area(void);
77
78    /**
79       \brief threshold for p_value calculation
80
81       Function can used to change the minimum_size.
82
83       \return reference to threshold minimum size
84     */
85    unsigned int& minimum_size(void);
86
87    /**
88       \brief threshold for p_value calculation
89
90       Threshold deciding whether p-value is computed using exact
91       method or a Gaussian approximation. If both number of positive
92       samples, n_pos(void), and number of negative samples,
93       n_neg(void), are smaller than minimum_size the exact method is
94       used.
95       
96       \see p_value
97
98       \return const reference to minimum_size
99    */
100    const unsigned int& minimum_size(void) const;
101
102    ///
103    /// \brief number of samples
104    ///
105    /// @return sum of weights
106    ///
107    double n(void) const;
108
109    ///
110    /// \brief number of negative samples
111    ///
112    /// @return sum of weights with negative target
113    ///
114    double n_neg(void) const;
115
116    ///
117    /// \brief number of positive samples
118    ///
119    /// @return sum of weights with positive target
120    ///
121    double n_pos(void) const;
122
123    /**
124       \brief One-sided P-value
125
126       Calculates the one-sided p-value, i.e., probability to get this
127       area (or greater) given that there is no difference
128       between the two classes.
129
130       \b Exact \b method: In the exact method the function goes
131       through all permutations and counts what fraction for which the
132       area is greater (or equal) than area in original permutation.
133
134       \b Large-sample \b Approximation: When many data points are
135       available, see minimum_size(), a Gaussian approximation is used
136       and the p-value is calculated as
137       \f[
138       P = \frac{1}{\sqrt{2\pi}} \int_{-\infty}^z
139       \exp{\left(-\frac{t^2}{2}\right)} dt
140       \f]
141
142       where
143
144       \f[
145       z = \frac{\textrm{area} - 0.5 - 0.5/(n^\cdot +n^-)}{s}
146       \f]
147
148       and
149
150       \f[
151       s^2 = \frac{n+1+\sum \left(n_x \cdot (n_x^2-1)\right)}
152       {12\cdot n^+\cdot n^-}
153       \f]
154
155       where sum runs over different data values (of ties) and \f$ n_x
156       \f$ is number data points with that value. The sum i a
157       correction term for ties and is zero if there are no ties.
158
159       \return \f$ P(a \ge \textrm{area}) \f$
160
161       \note Weights should be -1, 0, or 1; otherwise the p-value is
162       undefined and may change in future versions.
163     */
164    double p_value_one_sided(void) const;
165   
166    /**
167       \brief Two-sided p-value.
168
169       Calculates the probability to get an area, \c a, equal or more
170       extreme than \c area
171       \f[
172       P(a \ge \textrm{max}(\textrm{area},1-\textrm{area})) +
173       P(a \le \textrm{min}(\textrm{area}, 1-\textrm{area})) \f]
174
175       If there are no ties, distribution of \a a is symmetric, so if
176       area is greater than 0.5, this boils down to \f$ P = 2*P(a \ge
177       \textrm{area}) = 2*P_\textrm{one-sided}\f$.
178
179       \return two-sided p-value
180
181       \see p_value_one_sided
182    */
183    double p_value(void) const;
184
185    /**
186       @brief Set everything to zero
187    */
188    void reset(void);
189
190  private:
191    typedef std::multimap<double, std::pair<bool, double> > Map;
192   
193    /// Implemented as in MatLab 13.1
194    double get_p_approx(double) const;
195
196    /*
197     */
198    template<typename ForwardIterator>
199    double p_exact_with_ties(ForwardIterator first, ForwardIterator last,
200                             double block, double pos, double neg) const;
201
202    /**
203       \return probability to get auc >= \a area. If area<0.5
204       probability to auc <= area is returned
205
206       \note assumes all non-zero weights are equal (typically unity
207       but not necessarily
208    */
209    double p_exact(double area) const;
210
211    bool use_exact_method(void) const;
212
213    double area_;
214    bool has_ties_;
215    unsigned int minimum_size_;
216    double w_neg_;
217    double w_pos_;
218    Map multimap_;
219  };
220
221  template<typename ForwardIterator>
222  double 
223  ROC::p_exact_with_ties(ForwardIterator begin, ForwardIterator end,
224                         double block, double pos, double neg) const
225  {
226    if (block <= 0)
227      return 1.0;
228    if (block > pos*neg)
229      return 0.0;
230
231    ForwardIterator iter(begin);
232    size_t n=0;
233    while (iter!=end && iter->first == begin->first) {
234      ++iter;
235      ++n;
236    }
237    double result = 0;
238    for (size_t i=0; i<=n; ++i) {
239      double pos1 = i;
240      double neg1 = n-i;
241      double pos2 = pos-pos1;
242      double neg2 = neg-neg1;
243      result += gsl_ran_hypergeometric_pdf(i, pos, neg, n)
244        * p_exact_with_ties(iter, end, 
245                            block - pos2*neg1 - 0.5*pos1*neg1, 
246                            pos2, neg2);
247    }
248    return result;
249  }
250
251}}} // of namespace statistics, yat, and theplu
252
253#endif
Note: See TracBrowser for help on using the repository browser.