source: trunk/c++_tools/statistics/ROC.h @ 648

Last change on this file since 648 was 648, checked in by Peter, 15 years ago

fixes #133 removed all errors reported from Doxygen. Only one error left which says Index is not documented but I don't want it to be documented actually we use the Doxygens preprocessor to skip documenting that class, yet Doxygen complains that class is not documented huh. Only solution would be to move that class to its own file and not keep it together with SVM.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 4.7 KB
Line 
1#ifndef _theplu_statistics_roc_
2#define _theplu_statistics_roc_
3
4// $Id: ROC.h 648 2006-09-14 03:04:17Z peter $
5
6#include <c++_tools/classifier/Target.h>
7#include <c++_tools/statistics/Score.h>
8
9#include <utility>
10#include <vector>
11
12namespace theplu {
13  namespace utility {
    // Forward declaration only: within this header utility::vector is
    // used solely as a const-reference parameter type.
14    class vector;
15  }
16namespace statistics { 
17
18  ///
19  /// Class for ROC (Receiver Operating Characteristic).
20  ///   
21  /// As the area under an ROC curve is equivalent to the Mann-Whitney U
22  /// statistic, this class can be used to perform a Mann-Whitney
23  /// U-test (aka Wilcoxon).
24  ///
25  class ROC : public Score
26  {
27 
28  public:
29    ///
30    /// Default constructor.
      ///
      /// @param absolute if true, the score functions return 1-area
      /// whenever the area under the ROC curve is less than 0.5.
31    ///
32    ROC(bool absolute=true);
33         
34    ///
35    /// Destructor (empty body).
36    ///
37    virtual ~ROC(void) {};
38         
39    /// Function taking \a value and \a target (+1 or -1). The score
40    /// is equivalent to the Mann-Whitney statistic.
41    ///
      /// NOTE(review): the previous text also listed a "vector defining
      /// what samples to use", but this overload has no such parameter.
      ///
42    /// @return the area under the ROC curve. If the area is less
43    /// than 0.5 and absolute=true, 1-area is returned. Complexity is
44    /// \f$ N\log N \f$ where \f$ N \f$ is number of samples.
45    ///
46    double score(const classifier::Target& target, 
47                 const utility::vector& value); 
48   
49    /// Function taking target and a weighted data lookup \a value.
50    /// The area is defined as \f$ \frac{\sum
51    /// w^+w^-}{\sum w^+w^-}\f$, where the sum in the numerator goes
52    /// over all pairs where value+ is larger than value-. The
53    /// denominator goes over all pairs. If target is equal to 1,
54    /// sample belongs to class + otherwise sample belongs to class
55    /// -. @return weighted version of area under the ROC curve. If
56    /// the area is less than 0.5 and absolute=true, 1-area is
57    /// returned. Complexity is \f$ N^2 \f$ where \f$ N \f$ is number
58    /// of samples.
59    ///
      /// NOTE(review): this overload has no separate weight argument;
      /// presumably the weights are carried by \a value - confirm
      /// against the implementation.
      ///
60    double score(const classifier::Target& target, 
61                 const classifier::DataLookupWeighted1D& value); 
62       
63
64    /// Function taking values, target and weight.
65    /// The area is defined as \f$ \frac{\sum
66    /// w^+w^-}{\sum w^+w^-}\f$, where the sum in the numerator goes
67    /// over all pairs where value+ is larger than value-. The
68    /// denominator goes over all pairs. If target is equal to 1,
69    /// sample belongs to class + otherwise sample belongs to class
70    /// -. @return weighted version of area under the ROC curve. If
71    /// the area is less than 0.5 and absolute=true, 1-area is
72    /// returned. Complexity is \f$ N^2 \f$ where \f$ N \f$ is number
73    /// of samples.
74    ///
      /// NOTE(review): the previous text also listed a "vector defining
      /// what samples to use", but this overload has no such parameter.
      ///
75    double score(const classifier::Target& target, 
76                 const utility::vector& value, 
77                 const utility::vector& weight); 
78       
79
80    ///
81    /// Calculates the p-value, i.e. the probability of observing an
82    /// area equal or larger if the null hypothesis is true. If P is
83    /// near zero, this casts doubt on this hypothesis. The null
84    /// hypothesis is that the values from the 2 classes are generated
85    /// from 2 identical distributions. The alternative is that the
86    /// median of the first distribution is shifted from the median of
87    /// the second distribution by a non-zero amount. If the smallest
88    /// group size is larger than minimum_size (default = 10), then P
89    /// is calculated using a normal approximation.  @return the
90    /// one-sided p-value (if absolute=true is used this is equivalent
91    /// to the two-sided p-value).
92    ///
93    double p_value(void) const;
94   
95    ///
96    /// minimum_size is the threshold for when a normal
97    /// approximation is used for the p-value calculation.
98    ///
       /// The reference is non-const, so the threshold can be changed
       /// through it.
       ///
       /// NOTE(review): u_int is not a standard C++ type and no header
       /// included directly here is a standard one declaring it -
       /// confirm which included header supplies it.
       ///
99    /// @return reference to minimum_size
100    ///
101    inline u_int& minimum_size(void){ return minimum_size_; } 
102
103    ///
104    /// Function returning true if target is positive (binary()) for
105    /// the sample with ith lowest data value, so i=0 corresponds to
106    /// the sample with the lowest data value and i=n()-1 the sample
107    /// with highest data value.
108    ///
109    bool target(const size_t i) const;
110
111    ///
112    /// @return number of samples (the size of the stored
       /// class-value-pair vector)
113    ///
114    inline size_t n(void) const { return vec_pair_.size(); }
115
116    ///
117    /// @return number of positive samples (Target.binary()==true)
118    ///
119    inline size_t n_pos(void) const { return nof_pos_; }
120
121  private:
122   
123    /// Implemented as in MatLab 13.1
       ///
       /// NOTE(review): presumably the normal-approximation branch of
       /// p_value(); the parameter is unnamed here - see the
       /// implementation for its meaning.
124    double get_p_approx(const double) const;
125
126    /// Implemented as in MatLab 13.1
       ///
       /// NOTE(review): presumably the exact (small-sample) branch of
       /// p_value(); the parameters are unnamed here - see the
       /// implementation for their meaning.
127    double get_p_exact(const double, const double, const double) const;
128
       // NOTE(review): presumably the area from the latest score() call - confirm
129    double area_;
       // threshold exposed through minimum_size()
130    u_int minimum_size_;
       // count returned by n_pos()
131    u_int nof_pos_;
132    std::vector<std::pair<bool, double> > vec_pair_; // class-value-pair
133  };
134
135  ///
136  /// The output operator for the ROC class. The output is an Nx2
137  /// matrix, where the first column is the sensitivity and second
138  /// is the specificity.
139  ///
     /// @param s stream to write to
     /// @return a std::ostream reference (presumably \a s, to allow
     /// chaining - confirm against the implementation)
     ///
     /// NOTE(review): std::ostream is named here, but this header does
     /// not itself include <iosfwd> or <ostream>; it presumably relies
     /// on one of the project headers included above - confirm.
     ///
140  std::ostream& operator<< (std::ostream& s, const ROC&);
141
142
143}} // of namespace statistics and namespace theplu
144
145#endif
146
Note: See TracBrowser for help on using the repository browser.