source: trunk/yat/statistics/ROC.h @ 767

Last change on this file since 767 was 767, checked in by Peter, 17 years ago

Fixes #65

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 5.4 KB
Line 
1#ifndef _theplu_yat_statistics_roc_
2#define _theplu_yat_statistics_roc_
3
4// $Id: ROC.h 767 2007-02-22 15:14:40Z peter $
5
6/*
7  Copyright (C) The authors contributing to this file.
8
9  This file is part of the yat library, http://lev.thep.lu.se/trac/yat
10
11  The yat library is free software; you can redistribute it and/or
12  modify it under the terms of the GNU General Public License as
13  published by the Free Software Foundation; either version 2 of the
14  License, or (at your option) any later version.
15
16  The yat library is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19  General Public License for more details.
20
21  You should have received a copy of the GNU General Public License
22  along with this program; if not, write to the Free Software
23  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
24  02111-1307, USA.
25*/
26
27#include "Score.h"
28
29#include <utility>
30#include <vector>
31
32namespace theplu {
33namespace yat {
34namespace classifier {
35  class Target; // forward declaration; ROC only takes Target by const reference
36} // NOTE(review): classifier::DataLookupWeighted1D is also used by ROC::score() but is not declared here -- presumably Score.h provides it; confirm
37namespace utility {
38  class vector; // forward declaration; ROC only takes utility::vector by const reference
39}
40namespace statistics { 
41
42  ///
43  /// @brief Class for Receiver Operating Characteristic.
44  ///   
45  /// As the area under an ROC curve is equivalent to the Mann-Whitney U
46  /// statistic, this class can be used to perform a Mann-Whitney
47  /// U-test (aka Wilcoxon).
48  ///
49  class ROC : public Score
50  {
51 
52  public:
53    ///
54    /// @brief Default constructor
55    /// @param absolute presumably the flag the score() docs call absolute=true -- confirm in Score
56    ROC(bool absolute=true);
57         
58    ///
59    /// @brief The destructor
60    ///
61    virtual ~ROC(void);
62         
63    ///
64    /// minimum_size is the threshold for when a normal
65    /// approximation is used for the p-value calculation.
66    ///
67    /// @return non-const reference to minimum_size, so callers can change it
68    ///
69    u_int& minimum_size(void);
70
71    ///
72    /// @return number of samples
73    ///
74    size_t n(void) const;
75
76    ///
77    /// @return number of positive samples (Target.binary()==true)
78    ///
79    size_t n_pos(void) const;
80
81    ///
82    /// Calculates the p-value, i.e. the probability of observing an
83    /// area equal to or larger than the observed one if the null
84    /// hypothesis is true. If P is near zero, this casts doubt on this
85    /// hypothesis. The null hypothesis is that the values from the 2
86    /// classes are generated from 2 identical distributions. The
87    /// alternative is that the median of the first distribution is
88    /// shifted from the median of the second distribution by a non-zero
89    /// amount. If the smallest group size is larger than minimum_size
90    /// (default = 10), then P is calculated using a normal approximation.
91    /// @return the one-sided p-value (if absolute true is used this is
92    /// equivalent to the two-sided p-value).
93    ///
94    double p_value(void) const;
95   
96    /// Function taking \a target (+1 or -1) and \a value; no separate
97    /// vector selecting samples is passed to this overload. The score
98    /// is equivalent to the Mann-Whitney statistic.
99    /// @return the area under the ROC curve. If the area is less
100    /// than 0.5 and absolute=true, 1-area is returned. Complexity is
101    /// \f$ N\log N \f$ where \f$ N \f$ is number of samples.
102    ///
103    double score(const classifier::Target& target, 
104                 const utility::vector& value); 
105   
106    /**
107        Function taking \a target and \a value (presumably carrying both
108        values and weights; confirm in DataLookupWeighted1D). The area is
109        defined as \f$ \frac{\sum w^+w^-}{\sum w^+w^-}\f$, where the sum
110        in the numerator goes over all pairs where value+ is larger than
111        value-, and the sum in the denominator goes over all pairs. If
112        target is equal to 1, sample belongs to class +, otherwise sample
113        belongs to class -. @return weighted version of area under the
114        ROC curve. If the area is less than 0.5 and absolute=true, 1-area
115        is returned. Complexity is \f$ N^2 \f$ where \f$ N \f$ is number
116        of samples.
117    */
118    double score(const classifier::Target& target, 
119                 const classifier::DataLookupWeighted1D& value); 
120
121    /**
122        Function taking \a target, \a value and \a weight. The area is
123        defined as \f$ \frac{\sum w^+w^-}{\sum w^+w^-}\f$, where the sum
124        in the numerator goes over all pairs where value+ is larger than
125        value-, and the sum in the denominator goes over all pairs. If
126        target is equal to 1, sample belongs to class +, otherwise sample
127        belongs to class -. @return weighted version of area under the
128        ROC curve. If the area is less than 0.5 and absolute=true, 1-area
129        is returned. Complexity is \f$ N^2 \f$ where \f$ N \f$ is number
130        of samples.
131    */
132    double score(const classifier::Target& target, 
133                 const utility::vector& value, 
134                 const utility::vector& weight); 
135
136    ///
137    /// Function returning true if target is positive (binary()) for
138    /// the sample with the ith lowest data value, so i=0 corresponds to
139    /// the sample with the lowest data value and i=n()-1 to the sample
140    /// with the highest data value.
141    ///
142    bool target(const size_t i) const;
143
144  private:
145   
146    /// Implemented as in MatLab 13.1 (judging by the name, the normal-approximation p-value)
147    double get_p_approx(const double) const;
148
149    /// Implemented as in MatLab 13.1 (judging by the name, the exact p-value)
150    double get_p_exact(const double, const double, const double) const;
151
152    double area_; // presumably the area from the latest score() call -- confirm in ROC.cc
153    u_int minimum_size_; // presumably the threshold exposed through minimum_size()
154    u_int nof_pos_; // presumably the count reported by n_pos()
155    std::vector<std::pair<bool, double> > vec_pair_; // class-value pairs, one per sample
156  };
158
159  ///
160  /// The output operator for the ROC class. The output is an Nx2
161  /// matrix, where the first column is the sensitivity and the second
162  /// column is the specificity.
163  /// @return reference to an output stream (presumably \a s -- confirm in ROC.cc)
164  std::ostream& operator<< (std::ostream& s, const ROC&);
165
166}}} // of namespace statistics, yat, and theplu
167
168#endif
Note: See TracBrowser for help on using the repository browser.