Changeset 821 for trunk/yat/statistics


Ignore:
Timestamp:
Mar 18, 2007, 5:00:05 PM (15 years ago)
Author:
Peter
Message:

Modified ROC class to use AUC class in calculation of ROC area. Refs #101

Location:
trunk/yat/statistics
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • trunk/yat/statistics/AUC.cc

    r820 r821  
     1
    12// $Id$
    23
  • trunk/yat/statistics/AUC.h

    r820 r821  
    9595
    9696  private:
     97    friend class ROC;
     98
    9799    typedef std::multimap<double, std::pair<bool, double> > MultiMap;
    98100    double score(const MultiMap&) const;
  • trunk/yat/statistics/ROC.cc

    r781 r821  
    2323
    2424#include "ROC.h"
     25#include "AUC.h"
    2526#include "yat/classifier/DataLookupWeighted1D.h"
    2627#include "yat/classifier/Target.h"
     
    4041
    4142  ROC::ROC(void)
    42     : area_(0.5), minimum_size_(10), nof_pos_(0) 
     43    :minimum_size_(10)
    4344  {
     45    reset();
    4446  }
    4547
     
    4850  }
    4951
    50   double ROC::get_p_approx(const double area) const
     52
     53  void ROC::add(double x, bool target, double w)
    5154  {
    52     double x = area - 0.5;
     55    if (!w)
     56      return;
     57    multimap_.insert(std::make_pair(x, std::make_pair(target, w)));
     58    if (target)
     59      w_pos_+=w;
     60    else
     61      w_neg_+=w;
     62    area_=-1;
     63  }
     64
     65
     66  double ROC::area(void)
     67  {
     68    if (area_==-1){
     69      AUC auc(false);
     70      area_=auc.score(multimap_);
     71    }
     72    return area_;
     73  }
     74
     75
     76  double ROC::get_p_approx(double x) const
     77  {
     78    // make x standard normal
     79    x -= 0.5;
    5380    // Not integrating from the middle of the bin, but from the inner edge.
    5481    if (x>0)
    55       x -= 0.5/nof_pos_/(n()-nof_pos_);
     82      x -= 0.5/(n_pos()*n_neg());
    5683    else if(x<0)
    57       x += 0.5/nof_pos_/(n()-nof_pos_);
    58 
    59     double sigma = (std::sqrt( (n()-nof_pos_)*nof_pos_*
    60                                (n()+1.0)/12 ) /
    61                     ( n() - nof_pos_ ) / nof_pos_ );
    62     double p = gsl_cdf_gaussian_Q(x, sigma);
    63    
    64     return p;
     84      x += 0.5/(n_pos()*n_neg());
     85    else if (x==0)
     86      return 0.5;
     87    double sigma = std::sqrt( ( n()+1.0 )/(12*n_pos()*n_neg()) );
     88    return gsl_cdf_gaussian_Q(x, sigma);
    6589  }
    6690
     
    6892                          const double nof_neg) const
    6993  {
    70     double p;
    7194    if (block <= 0.0)
    72       p = 1.0;
    73     else if (block > nof_neg*nof_pos)
    74       p = 0.0;
    75     else {
    76       double p1 = get_p_exact(block-nof_neg, nof_pos-1, nof_neg);
    77       double p2 = get_p_exact(block, nof_pos, nof_neg-1);
    78       p = nof_pos/(nof_pos+nof_neg)*p1 + nof_neg/(nof_pos+nof_neg)*p2;
    79     }
    80     return p;
     95      return 1.0;
     96    if (block > nof_neg*nof_pos)
     97      return 0.0;
     98    double p1 = get_p_exact(block-nof_neg, nof_pos-1, nof_neg);
     99    double p2 = get_p_exact(block, nof_pos, nof_neg-1);
     100    return nof_pos/(nof_pos+nof_neg)*p1 + nof_neg/(nof_pos+nof_neg)*p2;
    81101  }
    82102
     
    86106  }
    87107
     108
     109  const u_int& ROC::minimum_size(void) const
     110  {
     111    return minimum_size_;
     112  }
     113
     114
    88115  size_t ROC::n(void) const
    89116  {
    90     return vec_pair_.size();
     117    return n_pos()+n_neg();
    91118  }
     119
     120
     121  size_t ROC::n_neg(void) const
     122  {
     123    return w_neg_;
     124  }
     125
    92126
    93127  size_t ROC::n_pos(void) const
    94128  {
    95     return nof_pos_;
    96   }
    97 
    98   double ROC::p_value(void) const
    99   {
    100     if (weighted_)
    101       return 1.0;
    102     else if (nof_pos_ < minimum_size_ & n()-nof_pos_ < minimum_size_)
    103       return get_p_exact(area_*nof_pos_*(n()-nof_pos_),
    104                           nof_pos_, n()-nof_pos_);
    105     else
    106       return get_p_approx(area_);
    107    
    108   }
    109 
    110   double ROC::score(const classifier::Target& target,
    111                     const utility::vector& value)
    112   {
    113     assert(target.size()==value.size());
    114     weighted_=false;
    115 
    116     vec_pair_.clear();
    117     vec_pair_.reserve(target.size());
    118     for (size_t i=0; i<target.size(); i++)
    119       vec_pair_.push_back(std::make_pair(target.binary(i),value(i)));
    120 
    121     std::sort(vec_pair_.begin(),vec_pair_.end(),
    122               utility::pair_value_compare<bool, double>());
    123     area_ = 0;
    124     nof_pos_=0;
    125     for (size_t i=0; i<n(); i++){
    126       if (vec_pair_[i].first){
    127         area_+=i;
    128         nof_pos_++;
    129       }
    130     }
    131 
    132     // Normalizing the area to [0,1]
    133     area_ = ( (area_-nof_pos_*(nof_pos_-1)/2 ) /
    134               (nof_pos_*(n()-nof_pos_)) );
    135    
    136     return area_;
     129    return w_pos_;
    137130  }
    138131
    139132
    140   // Peter, should be possible to do this in NlogN
    141   double ROC::score(const classifier::Target& target,
    142                     const classifier::DataLookupWeighted1D& value)
     133  double ROC::p_value() const
    143134  {
    144     weighted_=true;
    145 
    146     vec_pair_.clear();
    147     vec_pair_.reserve(target.size());
    148     for (unsigned int i=0; i<target.size(); i++)
    149       if (value.weight(i))
    150         vec_pair_.push_back(std::make_pair(target.binary(i),value.data(i)));
    151 
    152     std::sort(vec_pair_.begin(),vec_pair_.end(),
    153               utility::pair_value_compare<int, double>());
    154 
    155     area_=0;
    156     nof_pos_=0;
    157     double max_area=0;
    158 
    159     for (size_t i=0; i<n(); i++)
    160       if (target.binary(i))
    161         for (size_t j=0; j<n(); j++)
    162           if (!target.binary(j)){
    163             if (value.data(i)>value.data(j))
    164               area_+=value.weight(i)*value.weight(j);
    165             max_area+=value.weight(i)*value.weight(j);
    166           }
    167    
    168     area_/=max_area;
    169    
    170     return area_;
     135    double p = 2*p_value_one_sided();
     136    return std::min(p, 2-p);
    171137  }
    172138
    173139
    174   // Peter, should be possible to do this in NlogN
    175   double ROC::score(const classifier::Target& target,
    176                     const utility::vector& value,
    177                     const utility::vector& weight)
     140  double ROC::p_value_one_sided() const
    178141  {
    179     weighted_=true;
    180 
    181     vec_pair_.clear();
    182     vec_pair_.reserve(target.size());
    183     for (unsigned int i=0; i<target.size(); i++)
    184       if (weight(i))
    185         vec_pair_.push_back(std::make_pair(target.binary(i),value(i)));
    186 
    187     std::sort(vec_pair_.begin(),vec_pair_.end(),
    188               utility::pair_value_compare<int, double>());
    189 
    190     area_=0;
    191     nof_pos_=0;
    192     double max_area=0;
    193 
    194     for (size_t i=0; i<n(); i++)
    195       if (target.binary(i))
    196         for (size_t j=0; j<n(); j++)
    197           if (!target.binary(j)){
    198             if (value(i)>value(j))
    199               area_+=weight(i)*weight(j);
    200             max_area+=weight(i)*weight(j);
    201           }
    202    
    203     area_/=max_area;
    204    
    205     return area_;
     142    double area(area_);
     143    if (area_==-1){
     144      AUC auc(false);
     145      area = auc.score(multimap_);
     146    }
     147    if (n_pos() < minimum_size_ && n_neg() < minimum_size_) {
     148      // for small areas we calculate probabilitu to get larger area -
     149      // not larger or equal.
     150      if (area<0.5)
     151        return 1-get_p_exact((1-area)*n_pos()*n_neg(), n_pos(), n_neg());
     152      return get_p_exact(area*n_pos()*n_neg(), n_pos(), n_neg());
     153    }
     154    return get_p_approx(area);
    206155  }
    207156
    208   bool ROC::target(const size_t i) const
     157
     158  void ROC::reset(void)
    209159  {
    210     return vec_pair_[i].first;
    211   }
    212 
    213   std::ostream& operator<<(std::ostream& s, const ROC& r)
    214   {
    215     s.setf( std::ios::dec );
    216     s.precision(12);
    217     double sens = 1;
    218     double spec = 0;
    219     size_t n = r.n();
    220     double nof_pos = r.n_pos();
    221     for(size_t i=0; i<n-1; ++i) {
    222       s << sens << "\t";
    223       s << spec << "\n";
    224       if (r.target(i))
    225         spec -= 1/(n-nof_pos);
    226       else
    227         sens -= 1/nof_pos;
    228     }
    229     s << sens << "\t";
    230     s << spec ;
    231     return s;
     160    area_=-1;
     161    w_pos_=0;
     162    w_neg_=0;
     163    multimap_.clear();
    232164  }
    233165
  • trunk/yat/statistics/ROC.h

    r779 r821  
    2525*/
    2626
     27#include <algorithm>
     28#include <map>
    2729#include <utility>
    28 #include <vector>
    2930
    3031namespace theplu {
     
    6162    virtual ~ROC(void);
    6263         
     64    /**
     65       Adding a data value to ROC.
     66    */
     67    void add(double value, bool target, double weight=1.0);
     68
     69    /**
     70       The area is defines as \f$ \frac{\sum w^+w^-} {\sum w^+w^-}\f$,
     71       where the sum in the numerator goes over all pairs where value+
     72       is larger than value-. The denominator goes over all pairs.
     73
     74       @return Area under curve.
     75    */
     76    double area(void);
     77
    6378    ///
    6479    /// minimum_size is the threshold for when a normal
     
    6984    u_int& minimum_size(void);
    7085
     86    /**
     87       minimum_size is the threshold for when a normal
     88       approximation is used for the p-value calculation.
     89       
     90       @return const reference to minimum_size
     91    */
     92    const u_int& minimum_size(void) const;
     93
    7194    ///
    72     /// @return number of samples
     95    /// @return sum of weights
    7396    ///
    7497    size_t n(void) const;
    7598
    7699    ///
    77     /// @return number of positive samples (Target.binary()==true)
     100    /// @return sum of weights with negative target
     101    ///
     102    size_t n_neg(void) const;
     103
     104    ///
     105    /// @return sum of weights with positive target
    78106    ///
    79107    size_t n_pos(void) const;
     
    88116    ///the second distribution by a non-zero amount. If the smallest
    89117    ///group size is larger than minimum_size (default = 10), then P
    90     ///is calculated using a normal approximation.  @return the
    91     ///one-sided p-value( if absolute true is used this is equivalent
    92     ///to the two-sided p-value.)
     118    ///is calculated using a normal approximation. 
     119    ///
     120    /// \note Weights should be either zero or unity, else present
     121    /// implementation is nonsense.
     122    ///
     123    /// @return One-sided p-value.
    93124    ///
    94     double p_value(void) const;
     125    double p_value_one_sided(void) const;
    95126   
    96     /// Function taking \a value, \a target (+1 or -1) and vector
    97     /// defining what samples to use. The score is equivalent to
    98     /// Mann-Whitney statistics.
    99     /// @return the area under the ROC curve. If the area is less
    100     /// than 0.5 and absolute=true, 1-area is returned. Complexity is
    101     /// \f$ N\log N \f$ where \f$ N \f$ is number of samples.
    102     ///
    103     double score(const classifier::Target& target,
    104                  const utility::vector& value);
    105    
    106     /**
    107         Function taking values, target, weight and a vector defining
    108         what samples to use. The area is defines as \f$ \frac{\sum
    109         w^+w^-}{\sum w^+w^-}\f$, where the sum in the numerator goes
    110         over all pairs where value+ is larger than value-. The
    111         denominator goes over all pairs. If target is equal to 1,
    112         sample belonges to class + otherwise sample belongs to class
    113         -. @return wheighted version of area under the ROC curve. If
    114         the area is less than 0.5 and absolute=true, 1-area is
    115         returned. Complexity is \f$ N^2 \f$ where \f$ N \f$ is number
    116         of samples.
     127    /**
     128       @brief Two-sided p-value.
     129
     130       @return min(2*p_value_one_sided, 2-2*p_value_one_sided)
    117131    */
    118     double score(const classifier::Target& target,
    119                  const classifier::DataLookupWeighted1D& value);
     132    double p_value(void) const;
    120133
    121     /**
    122         Function taking values, target, weight and a vector defining
    123         what samples to use. The area is defines as \f$ \frac{\sum
    124         w^+w^-}{\sum w^+w^-}\f$, where the sum in the numerator goes
    125         over all pairs where value+ is larger than value-. The
    126         denominator goes over all pairs. If target is equal to 1,
    127         sample belonges to class + otherwise sample belongs to class
    128         -. @return wheighted version of area under the ROC curve. If
    129         the area is less than 0.5 and absolute=true, 1-area is
    130         returned. Complexity is \f$ N^2 \f$ where \f$ N \f$ is number
    131         of samples.
     134    /**
     135       @brief Set everything to zero
    132136    */
    133     double score(const classifier::Target& target,
    134                  const utility::vector& value,
    135                  const utility::vector& weight);
    136 
    137     ///
    138     /// Function returning true if target is positive (binary()) for
    139     /// the sample with ith lowest data value, so i=0 corresponds to
    140     /// the sample with the lowest data value and i=n()-1 the sample
    141     /// with highest data value.
    142     ///
    143     bool target(const size_t i) const;
     137    void reset(void);
    144138
    145139  private:
    146140   
    147141    /// Implemented as in MatLab 13.1
    148     double get_p_approx(const double) const;
     142    double get_p_approx(double) const;
    149143
    150144    /// Implemented as in MatLab 13.1
     
    153147    double area_;
    154148    u_int minimum_size_;
    155     u_int nof_pos_;
    156     std::vector<std::pair<bool, double> > vec_pair_; // class-value-pair
    157     bool weighted_;
     149    double w_neg_;
     150    double w_pos_;
     151    // <data pair<class, weight> >
     152    std::multimap<double, std::pair<bool, double> > multimap_;
    158153  };
    159 
    160   ///
    161   /// The output operator for the ROC class. The output is an Nx2
    162   /// matrix, where the first column is the sensitivity and second
    163   /// is the specificity.
    164   ///
    165   std::ostream& operator<< (std::ostream& s, const ROC&);
    166154
    167155}}} // of namespace statistics, yat, and theplu
Note: See TracChangeset for help on using the changeset viewer.