source: trunk/yat/statistics/ROC.h @ 718

Last change on this file since 718 was 718, checked in by Jari Häkkinen, 15 years ago

Addresses #170.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 5.4 KB
Line 
1#ifndef _theplu_yat_statistics_roc_
2#define _theplu_yat_statistics_roc_
3
4// $Id: ROC.h 718 2006-12-26 09:56:26Z jari $
5
6/*
7  Copyright (C) The authors contributing to this file.
8
9  This file is part of the yat library, http://lev.thep.lu.se/trac/yat
10
11  The yat library is free software; you can redistribute it and/or
12  modify it under the terms of the GNU General Public License as
13  published by the Free Software Foundation; either version 2 of the
14  License, or (at your option) any later version.
15
16  The yat library is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19  General Public License for more details.
20
21  You should have received a copy of the GNU General Public License
22  along with this program; if not, write to the Free Software
23  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
24  02111-1307, USA.
25*/
26
#include "Score.h"
#include "yat/classifier/Target.h"

#include <iosfwd>
#include <utility>
#include <vector>
32
33namespace theplu {
34namespace yat {
35  namespace utility {
36    class vector;
37  }
38namespace statistics { 
39
40  ///
41  /// Class for ROC (Reciever Operating Characteristic).
42  ///   
43  /// As the area under an ROC curve is equivalent to Mann-Whitney U
44  /// statistica, this class can be used to perform a Mann-Whitney
45  /// U-test (aka Wilcoxon).
46  ///
47  class ROC : public Score
48  {
49 
50  public:
51    ///
52    /// @brief Default constructor
53    ///
54    ROC(bool absolute=true);
55         
56    ///
57    /// @brief The destructor
58    ///
59    virtual ~ROC(void);
60         
61    ///
62    /// minimum_size is the threshold for when a normal
63    /// approximation is used for the p-value calculation.
64    ///
65    /// @return reference to minimum_size
66    ///
67    u_int& minimum_size(void);
68
69    ///
70    /// @return number of samples
71    ///
72    size_t n(void) const;
73
74    ///
75    /// @return number of positive samples (Target.binary()==true)
76    ///
77    size_t n_pos(void) const;
78
79    ///
80    ///Calculates the p-value, i.e. the probability of observing an
81    ///area equally or larger if the null hypothesis is true. If P is
82    ///near zero, this casts doubt on this hypothesis. The null
83    ///hypothesis is that the values from the 2 classes are generated
84    ///from 2 identical distributions. The alternative is that the
85    ///median of the first distribution is shifted from the median of
86    ///the second distribution by a non-zero amount. If the smallest
87    ///group size is larger than minimum_size (default = 10), then P
88    ///is calculated using a normal approximation.  @return the
89    ///one-sided p-value( if absolute true is used this is equivalent
90    ///to the two-sided p-value.)
91    ///
92    double p_value(void) const;
93   
94    /// Function taking \a value, \a target (+1 or -1) and vector
95    /// defining what samples to use. The score is equivalent to
96    /// Mann-Whitney statistics.
97    /// @return the area under the ROC curve. If the area is less
98    /// than 0.5 and absolute=true, 1-area is returned. Complexity is
99    /// \f$ N\log N \f$ where \f$ N \f$ is number of samples.
100    ///
101    double score(const classifier::Target& target, 
102                 const utility::vector& value); 
103   
104    /**
105        Function taking values, target, weight and a vector defining
106        what samples to use. The area is defines as \f$ \frac{\sum
107        w^+w^-}{\sum w^+w^-}\f$, where the sum in the numerator goes
108        over all pairs where value+ is larger than value-. The
109        denominator goes over all pairs. If target is equal to 1,
110        sample belonges to class + otherwise sample belongs to class
111        -. @return wheighted version of area under the ROC curve. If
112        the area is less than 0.5 and absolute=true, 1-area is
113        returned. Complexity is \f$ N^2 \f$ where \f$ N \f$ is number
114        of samples.
115    */
116    double score(const classifier::Target& target, 
117                 const classifier::DataLookupWeighted1D& value); 
118
119    /**
120        Function taking values, target, weight and a vector defining
121        what samples to use. The area is defines as \f$ \frac{\sum
122        w^+w^-}{\sum w^+w^-}\f$, where the sum in the numerator goes
123        over all pairs where value+ is larger than value-. The
124        denominator goes over all pairs. If target is equal to 1,
125        sample belonges to class + otherwise sample belongs to class
126        -. @return wheighted version of area under the ROC curve. If
127        the area is less than 0.5 and absolute=true, 1-area is
128        returned. Complexity is \f$ N^2 \f$ where \f$ N \f$ is number
129        of samples.
130    */
131    double score(const classifier::Target& target, 
132                 const utility::vector& value, 
133                 const utility::vector& weight); 
134
135    ///
136    /// Function returning true if target is positive (binary()) for
137    /// the sample with ith lowest data value, so i=0 corresponds to
138    /// the sample with the lowest data value and i=n()-1 the sample
139    /// with highest data value.
140    ///
141    bool target(const size_t i) const;
142
143  private:
144   
145    /// Implemented as in MatLab 13.1
146    double get_p_approx(const double) const;
147
148    /// Implemented as in MatLab 13.1
149    double get_p_exact(const double, const double, const double) const;
150
151    double area_;
152    u_int minimum_size_;
153    u_int nof_pos_;
154    std::vector<std::pair<bool, double> > vec_pair_; // class-value-pair
155  };
156
157  ///
158  /// The output operator for the ROC class. The output is an Nx2
159  /// matrix, where the first column is the sensitivity and second
160  /// is the specificity.
161  ///
162  std::ostream& operator<< (std::ostream& s, const ROC&);
163
164}}} // of namespace statistics, yat, and theplu
165
166#endif
Note: See TracBrowser for help on using the repository browser.