source: trunk/yat/statistics/ROC.h @ 2592

Last change on this file since 2592 was 2592, checked in by Peter, 11 years ago

corrected includes. prefer including gsl before std.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 4.8 KB
Line 
1#ifndef _theplu_yat_statistics_roc_
2#define _theplu_yat_statistics_roc_
3
4// $Id: ROC.h 2592 2011-10-27 23:21:40Z peter $
5
6/*
7  Copyright (C) 2004 Peter Johansson
8  Copyright (C) 2005, 2006, 2007, 2008 Jari Häkkinen, Peter Johansson
9  Copyright (C) 2011 Peter Johansson
10
11  This file is part of the yat library, http://dev.thep.lu.se/yat
12
13  The yat library is free software; you can redistribute it and/or
14  modify it under the terms of the GNU General Public License as
15  published by the Free Software Foundation; either version 3 of the
16  License, or (at your option) any later version.
17
18  The yat library is distributed in the hope that it will be useful,
19  but WITHOUT ANY WARRANTY; without even the implied warranty of
20  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21  General Public License for more details.
22
23  You should have received a copy of the GNU General Public License
24  along with yat. If not, see <http://www.gnu.org/licenses/>.
25*/
26
27#include <gsl/gsl_randist.h>
28
29#include <map>
30#include <utility>
31
32namespace theplu {
33namespace yat {
34namespace statistics { 
35
36  ///
37  /// @brief Class for Reciever Operating Characteristic.
38  ///   
39  /// As the area under an ROC curve is equivalent to Mann-Whitney U
40  /// statistica, this class can be used to perform a Mann-Whitney
41  /// U-test (aka Wilcoxon).
42  ///
43  class ROC
44  {
45 
46  public:
47    ///
48    /// @brief Default constructor
49    ///
50    ROC(void);
51         
52    /**
53       Adding a data value to ROC.
54    */
55    void add(double value, bool target, double weight=1.0);
56
57    /**
58       The area is defines as \f$ \frac{\sum w^+w^-} {\sum w^+w^-}\f$,
59       where the sum in the numerator goes over all pairs where value+
60       is larger than value-. The denominator goes over all pairs.
61
62       @return Area under curve.
63    */
64    double area(void);
65
66    ///
67    /// minimum_size is the threshold for when a normal
68    /// approximation is used for the p-value calculation.
69    ///
70    /// @return reference to minimum_size
71    ///
72    unsigned int& minimum_size(void);
73
74    /**
75       minimum_size is the threshold for when a normal
76       approximation is used for the p-value calculation.
77       
78       @return const reference to minimum_size
79    */
80    const unsigned int& minimum_size(void) const;
81
82    ///
83    /// @return sum of weights
84    ///
85    double n(void) const;
86
87    ///
88    /// @return sum of weights with negative target
89    ///
90    double n_neg(void) const;
91
92    ///
93    /// @return sum of weights with positive target
94    ///
95    double n_pos(void) const;
96
97    ///
98    ///Calculates the p-value, i.e. the probability of observing an
99    ///area equally or larger if the null hypothesis is true. If P is
100    ///near zero, this casts doubt on this hypothesis. The null
101    ///hypothesis is that the values from the 2 classes are generated
102    ///from 2 identical distributions. The alternative is that the
103    ///median of the first distribution is shifted from the median of
104    ///the second distribution by a non-zero amount. If the smallest
105    ///group size is larger than minimum_size (default = 10), then P
106    ///is calculated using a normal approximation. 
107    ///
108    /// \note Weights should be either zero or unity, else present
109    /// implementation is nonsense.
110    ///
111    /// @return One-sided p-value.
112    ///
113    double p_value_one_sided(void) const;
114   
115    /**
116       @brief Two-sided p-value.
117
118       @return min(2*p_value_one_sided, 2-2*p_value_one_sided)
119    */
120    double p_value(void) const;
121
122    /**
123       @brief Set everything to zero
124    */
125    void reset(void);
126
127  private:
128    typedef std::multimap<double, std::pair<bool, double> > Map;
129   
130    /// Implemented as in MatLab 13.1
131    double get_p_approx(double) const;
132
133    /*
134     */
135    template<typename ForwardIterator>
136    double p_exact_with_ties(ForwardIterator first, ForwardIterator last,
137                             double block, double pos, double neg) const;
138
139    /**
140       \return probability to get auc >= \a area. If area<0.5
141       probability to auc <= area is returned
142
143       \note assumes all non-zero weights are equal (typically unity
144       but not necessarily
145    */
146    double p_exact(double area) const;
147
148    bool use_exact_method(void) const;
149
150    double area_;
151    bool has_ties_;
152    unsigned int minimum_size_;
153    double w_neg_;
154    double w_pos_;
155    Map multimap_;
156  };
157
158  template<typename ForwardIterator>
159  double 
160  ROC::p_exact_with_ties(ForwardIterator begin, ForwardIterator end,
161                         double block, double pos, double neg) const
162  {
163    if (block <= 0)
164      return 1.0;
165    if (block > pos*neg)
166      return 0.0;
167
168    ForwardIterator iter(begin);
169    size_t n=0;
170    while (iter!=end && iter->first == begin->first) {
171      ++iter;
172      ++n;
173    }
174    double result = 0;
175    for (size_t i=0; i<=n; ++i) {
176      double pos1 = i;
177      double neg1 = n-i;
178      double pos2 = pos-pos1;
179      double neg2 = neg-neg1;
180      result += gsl_ran_hypergeometric_pdf(i, pos, neg, n)
181        * p_exact_with_ties(iter, end, 
182                            block - pos2*neg1 - 0.5*pos1*neg1, 
183                            pos2, neg2);
184    }
185    return result;
186  }
187
188}}} // of namespace statistics, yat, and theplu
189
190#endif
Note: See TracBrowser for help on using the repository browser.