Changeset 778 for trunk/yat


Ignore:
Timestamp:
Mar 4, 2007, 2:20:01 PM (16 years ago)
Author:
Peter
Message:

added ROCscore class. refs #101

Location:
trunk/yat/statistics
Files:
1 edited
2 copied

Legend:

Unmodified
Added
Removed
  • trunk/yat/statistics/Makefile.am

    r772 r778  
    2727  AveragerWeighted.cc AveragerPairWeighted.cc Distance.cc   \
    2828  Euclidean.cc Fisher.cc FoldChange.cc Histogram.cc Pearson.cc      \
    29   PearsonDistance.cc ROC.cc SAM.cc Score.cc SNR.cc tScore.cc  \
     29  PearsonDistance.cc ROC.cc ROCscore.cc \
     30  SAM.cc Score.cc SNR.cc tScore.cc  \
    3031  utility.cc WilcoxonFoldChange.cc
    3132
     
    3536  AveragerWeighted.h AveragerPairWeighted.h Distance.h Euclidean.h \
    3637  Fisher.h  \
    37   FoldChange.h Histogram.h Pearson.h PearsonDistance.h ROC.h  \
    38   SAM.h Score.h SNR.h tScore.h utility.h WilcoxonFoldChange.h
     38  FoldChange.h Histogram.h Pearson.h PearsonDistance.h ROC.h  ROCscore.h\
     39  SAM.h Score.h SNR.h tScore.h \
     40  utility.h WilcoxonFoldChange.h
  • trunk/yat/statistics/ROCscore.cc

    r776 r778  
    2222*/
    2323
    24 #include "ROC.h"
     24#include "ROCscore.h"
    2525#include "yat/classifier/DataLookupWeighted1D.h"
    2626#include "yat/classifier/Target.h"
    2727#include "yat/utility/stl_utility.h"
    2828#include "yat/utility/vector.h"
    29 
    30 #include <gsl/gsl_cdf.h>
    3129
    3230#include <cmath>
     
    3836namespace statistics { 
    3937
    40   ROC::ROC(bool b)
    41     : Score(b), area_(0.5), minimum_size_(10), nof_pos_(0) 
    42   {
    43   }
    44 
    45   ROC::~ROC(void)
    46   {
    47   }
    48 
    49   double ROC::get_p_approx(const double area) const
    50   {
    51     double x = area - 0.5;
    52     // Not integrating from the middle of the bin, but from the inner edge.
    53     if (x>0)
    54       x -= 0.5/nof_pos_/(n()-nof_pos_);
    55     else if(x<0)
    56       x += 0.5/nof_pos_/(n()-nof_pos_);
    57 
    58     double sigma = (std::sqrt( (n()-nof_pos_)*nof_pos_*
    59                                (n()+1.0)/12 ) /
    60                     ( n() - nof_pos_ ) / nof_pos_ );
    61     double p = gsl_cdf_gaussian_Q(x, sigma);
    62    
    63     return p;
    64   }
    65 
    66   double ROC::get_p_exact(const double block, const double nof_pos,
    67                           const double nof_neg) const
    68   {
    69     double p;
    70     if (block <= 0.0)
    71       p = 1.0;
    72     else if (block > nof_neg*nof_pos)
    73       p = 0.0;
    74     else {
    75       double p1 = get_p_exact(block-nof_neg, nof_pos-1, nof_neg);
    76       double p2 = get_p_exact(block, nof_pos, nof_neg-1);
    77       p = nof_pos/(nof_pos+nof_neg)*p1 + nof_neg/(nof_pos+nof_neg)*p2;
    78     }
    79     return p;
    80   }
    81 
    82   u_int& ROC::minimum_size(void)
    83   {
    84     return minimum_size_;
    85   }
    86 
    87   size_t ROC::n(void) const
    88   {
    89     return vec_pair_.size();
    90   }
    91 
    92   size_t ROC::n_pos(void) const
    93   {
    94     return nof_pos_;
    95   }
    96 
    97   double ROC::p_value(void) const
    98   {
    99     if (weighted_)
    100       return 1.0;
    101     else if (nof_pos_ < minimum_size_ & n()-nof_pos_ < minimum_size_)
    102       return get_p_exact(area_*nof_pos_*(n()-nof_pos_),
    103                           nof_pos_, n()-nof_pos_);
    104     else
    105       return get_p_approx(area_);
    106    
    107   }
    108 
    109   double ROC::score(const classifier::Target& target,
    110                     const utility::vector& value)
     38  double ROCscore::score(const classifier::Target& target,
     39                         const utility::vector& value)
    11140  {
    11241    assert(target.size()==value.size());
    11342    weighted_=false;
    11443
    115     vec_pair_.clear();
    116     vec_pair_.reserve(target.size());
     44    std::vector<std::pair<bool, double> > class_value; // class-value-pair
     45    class_value.reserve(target.size());
    11746    for (size_t i=0; i<target.size(); i++)
    118       vec_pair_.push_back(std::make_pair(target.binary(i),value(i)));
     47      class_value.push_back(std::make_pair(target.binary(i),value(i)));
    11948
    120     std::sort(vec_pair_.begin(),vec_pair_.end(),
     49    std::sort(class_value.begin(),class_value.end(),
    12150              utility::pair_value_compare<bool, double>());
    122     area_ = 0;
    123     nof_pos_=0;
    124     for (size_t i=0; i<n(); i++){
    125       if (vec_pair_[i].first){
    126         area_+=i;
    127         nof_pos_++;
     51    double area = 0;
     52    u_int nof_pos =0;
     53    for (size_t i=0; i<class_value.size(); i++){
     54      if (class_value[i].first){
     55        area+=i;
     56        ++nof_pos;
    12857      }
    12958    }
    13059
    13160    // Normalizing the area to [0,1]
    132     area_ = ( (area_-nof_pos_*(nof_pos_-1)/2 ) /
    133               (nof_pos_*(n()-nof_pos_)) );
     61    area = ( (area-nof_pos*(nof_pos-1)/2 ) /
     62             (nof_pos*(class_value.size()-nof_pos)) );
    13463   
    135     //Returning score larger 0.5 that you get by random
    136     if (area_<0.5 && absolute_)
    137       area_=1.0-area_;
    138 
    139     return area_;
     64    if (area<0.5 && absolute_)
     65      return 1.0-area;
     66    return area;
    14067  }
    14168
    14269
    14370  // Peter, should be possible to do this in NlogN
    144   double ROC::score(const classifier::Target& target,
     71  double ROCscore::score(const classifier::Target& target,
    14572                    const classifier::DataLookupWeighted1D& value)
    14673  {
    147     weighted_=true;
    14874
    149     vec_pair_.clear();
    150     vec_pair_.reserve(target.size());
     75    std::vector<std::pair<bool, double> > class_value; // class-value-pair
     76    class_value.clear();
     77    class_value.reserve(target.size());
    15178    for (unsigned int i=0; i<target.size(); i++)
    15279      if (value.weight(i))
    153         vec_pair_.push_back(std::make_pair(target.binary(i),value.data(i)));
     80        class_value.push_back(std::make_pair(target.binary(i),value.data(i)));
    15481
    155     std::sort(vec_pair_.begin(),vec_pair_.end(),
     82    std::sort(class_value.begin(),class_value.end(),
    15683              utility::pair_value_compare<int, double>());
    15784
    158     area_=0;
    159     nof_pos_=0;
     85    double area=0;
    16086    double max_area=0;
    16187
     
    16591          if (!target.binary(j)){
    16692            if (value.data(i)>value.data(j))
    167               area_+=value.weight(i)*value.weight(j);
     93              area+=value.weight(i)*value.weight(j);
    16894            max_area+=value.weight(i)*value.weight(j);
    16995          }
    17096   
    171     area_/=max_area;
     97    area/=max_area;
    17298   
    173     if (area_<0.5 && absolute_)
    174       area_=1.0-area_;
    175    
    176     return area_;
     99    if (area<0.5 && absolute_)
     100      return 1.0-area;
     101    return area;
    177102  }
    178103
    179104
    180105  // Peter, should be possible to do this in NlogN
    181   double ROC::score(const classifier::Target& target,
    182                     const utility::vector& value,
    183                     const utility::vector& weight)
     106  double ROCscore::score(const classifier::Target& target,
     107                         const utility::vector& value,
     108                         const utility::vector& weight)
    184109  {
    185110    weighted_=true;
    186111
    187     vec_pair_.clear();
    188     vec_pair_.reserve(target.size());
     112
     113    std::vector<std::pair<bool, double> > class_value; // class-value-pair
     114    class_value.reserve(target.size());
    189115    for (unsigned int i=0; i<target.size(); i++)
    190116      if (weight(i))
    191         vec_pair_.push_back(std::make_pair(target.binary(i),value(i)));
     117        class_value.push_back(std::make_pair(target.binary(i),value(i)));
    192118
    193     std::sort(vec_pair_.begin(),vec_pair_.end(),
     119    std::sort(class_value.begin(),class_value.end(),
    194120              utility::pair_value_compare<int, double>());
    195121
    196     area_=0;
    197     nof_pos_=0;
     122    double area=0;
    198123    double max_area=0;
    199124
     
    203128          if (!target.binary(j)){
    204129            if (value(i)>value(j))
    205               area_+=weight(i)*weight(j);
     130              area+=weight(i)*weight(j);
    206131            max_area+=weight(i)*weight(j);
    207132          }
    208133   
    209     area_/=max_area;
     134    area/=max_area;
    210135   
    211     if (area_<0.5 && absolute_)
    212       area_=1.0-area_;
    213    
    214     return area_;
    215   }
    216 
    217   bool ROC::target(const size_t i) const
    218   {
    219     return vec_pair_[i].first;
    220   }
    221 
    222   std::ostream& operator<<(std::ostream& s, const ROC& r)
    223   {
    224     s.setf( std::ios::dec );
    225     s.precision(12);
    226     double sens = 1;
    227     double spec = 0;
    228     size_t n = r.n();
    229     double nof_pos = r.n_pos();
    230     for(size_t i=0; i<n-1; ++i) {
    231       s << sens << "\t";
    232       s << spec << "\n";
    233       if (r.target(i))
    234         spec -= 1/(n-nof_pos);
    235       else
    236         sens -= 1/nof_pos;
    237     }
    238     s << sens << "\t";
    239     s << spec ;
    240     return s;
     136    if (area<0.5 && absolute_)
     137      return 1.0-area;
     138    return area;
    241139  }
    242140
  • trunk/yat/statistics/ROCscore.h

    r776 r778  
    1 #ifndef _theplu_yat_statistics_roc_
    2 #define _theplu_yat_statistics_roc_
     1#ifndef _theplu_yat_statistics_roc_score_
     2#define _theplu_yat_statistics_roc_score_
    33
    44// $Id$
     
    2727#include "Score.h"
    2828
    29 #include <utility>
    30 #include <vector>
     29#include <cctype>
    3130
    3231namespace theplu {
     
    4342  /// @brief Class for Reciever Operating Characteristic.
    4443  ///   
    45   /// As the area under an ROC curve is equivalent to Mann-Whitney U
    46   /// statistica, this class can be used to perform a Mann-Whitney
    47   /// U-test (aka Wilcoxon).
    48   ///
    49   class ROC : public Score
     44  class ROCscore : public Score
    5045  {
    5146 
    5247  public:
    53     ///
    54     /// @brief Default constructor
    55     ///
    56     ROC(bool absolute=true);
    57          
    58     ///
    59     /// @brief The destructor
    60     ///
    61     virtual ~ROC(void);
    62          
    63     ///
    64     /// minimum_size is the threshold for when a normal
    65     /// approximation is used for the p-value calculation.
    66     ///
    67     /// @return reference to minimum_size
    68     ///
    69     u_int& minimum_size(void);
    70 
    7148    ///
    7249    /// @return number of samples
     
    7956    size_t n_pos(void) const;
    8057
    81     ///
    82     ///Calculates the p-value, i.e. the probability of observing an
    83     ///area equally or larger if the null hypothesis is true. If P is
    84     ///near zero, this casts doubt on this hypothesis. The null
    85     ///hypothesis is that the values from the 2 classes are generated
    86     ///from 2 identical distributions. The alternative is that the
    87     ///median of the first distribution is shifted from the median of
    88     ///the second distribution by a non-zero amount. If the smallest
    89     ///group size is larger than minimum_size (default = 10), then P
    90     ///is calculated using a normal approximation.  @return the
    91     ///one-sided p-value( if absolute true is used this is equivalent
    92     ///to the two-sided p-value.)
    93     ///
    94     double p_value(void) const;
    95    
    9658    /// Function taking \a value, \a target (+1 or -1) and vector
    9759    /// defining what samples to use. The score is equivalent to
     
    13597                 const utility::vector& weight);
    13698
    137     ///
    138     /// Function returning true if target is positive (binary()) for
    139     /// the sample with ith lowest data value, so i=0 corresponds to
    140     /// the sample with the lowest data value and i=n()-1 the sample
    141     /// with highest data value.
    142     ///
    143     bool target(const size_t i) const;
    144 
    14599  private:
    146100   
    147     /// Implemented as in MatLab 13.1
    148     double get_p_approx(const double) const;
    149 
    150     /// Implemented as in MatLab 13.1
    151     double get_p_exact(const double, const double, const double) const;
    152 
    153     double area_;
    154     u_int minimum_size_;
    155     u_int nof_pos_;
    156     std::vector<std::pair<bool, double> > vec_pair_; // class-value-pair
    157101  };
    158 
    159   ///
    160   /// The output operator for the ROC class. The output is an Nx2
    161   /// matrix, where the first column is the sensitivity and second
    162   /// is the specificity.
    163   ///
    164   std::ostream& operator<< (std::ostream& s, const ROC&);
    165102
    166103}}} // of namespace statistics, yat, and theplu
Note: See TracChangeset for help on using the changeset viewer.