# Changeset 821 for trunk/yat/statistics

Ignore:
Timestamp:
Mar 18, 2007, 5:00:05 PM (15 years ago)
Message:

Modified ROC class to use AUC class in calculation of ROC area. Refs #101

Location:
trunk/yat/statistics
Files:
4 edited

Unmodified
Removed
• ## trunk/yat/statistics/AUC.cc

 r820 // $Id$
• ## trunk/yat/statistics/AUC.h

 r820 private: friend class ROC; typedef std::multimap > MultiMap; double score(const MultiMap&) const;
• ## trunk/yat/statistics/ROC.cc

 r781 #include "ROC.h" #include "AUC.h" #include "yat/classifier/DataLookupWeighted1D.h" #include "yat/classifier/Target.h" ROC::ROC(void) : area_(0.5), minimum_size_(10), nof_pos_(0) :minimum_size_(10) { reset(); } } double ROC::get_p_approx(const double area) const void ROC::add(double x, bool target, double w) { double x = area - 0.5; if (!w) return; multimap_.insert(std::make_pair(x, std::make_pair(target, w))); if (target) w_pos_+=w; else w_neg_+=w; area_=-1; } double ROC::area(void) { if (area_==-1){ AUC auc(false); area_=auc.score(multimap_); } return area_; } double ROC::get_p_approx(double x) const { // make x standard normal x -= 0.5; // Not integrating from the middle of the bin, but from the inner edge. if (x>0) x -= 0.5/nof_pos_/(n()-nof_pos_); x -= 0.5/(n_pos()*n_neg()); else if(x<0) x += 0.5/nof_pos_/(n()-nof_pos_); double sigma = (std::sqrt( (n()-nof_pos_)*nof_pos_* (n()+1.0)/12 ) / ( n() - nof_pos_ ) / nof_pos_ ); double p = gsl_cdf_gaussian_Q(x, sigma); return p; x += 0.5/(n_pos()*n_neg()); else if (x==0) return 0.5; double sigma = std::sqrt( ( n()+1.0 )/(12*n_pos()*n_neg()) ); return gsl_cdf_gaussian_Q(x, sigma); } const double nof_neg) const { double p; if (block <= 0.0) p = 1.0; else if (block > nof_neg*nof_pos) p = 0.0; else { double p1 = get_p_exact(block-nof_neg, nof_pos-1, nof_neg); double p2 = get_p_exact(block, nof_pos, nof_neg-1); p = nof_pos/(nof_pos+nof_neg)*p1 + nof_neg/(nof_pos+nof_neg)*p2; } return p; return 1.0; if (block > nof_neg*nof_pos) return 0.0; double p1 = get_p_exact(block-nof_neg, nof_pos-1, nof_neg); double p2 = get_p_exact(block, nof_pos, nof_neg-1); return nof_pos/(nof_pos+nof_neg)*p1 + nof_neg/(nof_pos+nof_neg)*p2; } } const u_int& ROC::minimum_size(void) const { return minimum_size_; } size_t ROC::n(void) const { return vec_pair_.size(); return n_pos()+n_neg(); } size_t ROC::n_neg(void) const { return w_neg_; } size_t ROC::n_pos(void) const { return nof_pos_; } double ROC::p_value(void) const { if (weighted_) return 1.0; else if (nof_pos_ < minimum_size_ & n()-nof_pos_ < minimum_size_) return get_p_exact(area_*nof_pos_*(n()-nof_pos_), nof_pos_, n()-nof_pos_); else return get_p_approx(area_); } double ROC::score(const classifier::Target& target, const utility::vector& value) { assert(target.size()==value.size()); weighted_=false; vec_pair_.clear(); vec_pair_.reserve(target.size()); for (size_t i=0; i()); area_ = 0; nof_pos_=0; for (size_t i=0; i()); area_=0; nof_pos_=0; double max_area=0; for (size_t i=0; ivalue.data(j)) area_+=value.weight(i)*value.weight(j); max_area+=value.weight(i)*value.weight(j); } area_/=max_area; return area_; double p = 2*p_value_one_sided(); return std::min(p, 2-p); } // Peter, should be possible to do this in NlogN double ROC::score(const classifier::Target& target, const utility::vector& value, const utility::vector& weight) double ROC::p_value_one_sided() const { weighted_=true; vec_pair_.clear(); vec_pair_.reserve(target.size()); for (unsigned int i=0; i()); area_=0; nof_pos_=0; double max_area=0; for (size_t i=0; ivalue(j)) area_+=weight(i)*weight(j); max_area+=weight(i)*weight(j); } area_/=max_area; return area_; double area(area_); if (area_==-1){ AUC auc(false); area = auc.score(multimap_); } if (n_pos() < minimum_size_ && n_neg() < minimum_size_) { // for small areas we calculate probabilitu to get larger area - // not larger or equal. if (area<0.5) return 1-get_p_exact((1-area)*n_pos()*n_neg(), n_pos(), n_neg()); return get_p_exact(area*n_pos()*n_neg(), n_pos(), n_neg()); } return get_p_approx(area); } bool ROC::target(const size_t i) const void ROC::reset(void) { return vec_pair_[i].first; } std::ostream& operator<<(std::ostream& s, const ROC& r) { s.setf( std::ios::dec ); s.precision(12); double sens = 1; double spec = 0; size_t n = r.n(); double nof_pos = r.n_pos(); for(size_t i=0; i
• ## trunk/yat/statistics/ROC.h

 r779 */ #include #include #include #include namespace theplu { virtual ~ROC(void); /** Adding a data value to ROC. */ void add(double value, bool target, double weight=1.0); /** The area is defines as \f$\frac{\sum w^+w^-} {\sum w^+w^-}\f$, where the sum in the numerator goes over all pairs where value+ is larger than value-. The denominator goes over all pairs. @return Area under curve. */ double area(void); /// /// minimum_size is the threshold for when a normal u_int& minimum_size(void); /** minimum_size is the threshold for when a normal approximation is used for the p-value calculation. @return const reference to minimum_size */ const u_int& minimum_size(void) const; /// /// @return number of samples /// @return sum of weights /// size_t n(void) const; /// /// @return number of positive samples (Target.binary()==true) /// @return sum of weights with negative target /// size_t n_neg(void) const; /// /// @return sum of weights with positive target /// size_t n_pos(void) const; ///the second distribution by a non-zero amount. If the smallest ///group size is larger than minimum_size (default = 10), then P ///is calculated using a normal approximation.  @return the ///one-sided p-value( if absolute true is used this is equivalent ///to the two-sided p-value.) ///is calculated using a normal approximation. /// /// \note Weights should be either zero or unity, else present /// implementation is nonsense. /// /// @return One-sided p-value. /// double p_value(void) const; double p_value_one_sided(void) const; /// Function taking \a value, \a target (+1 or -1) and vector /// defining what samples to use. The score is equivalent to /// Mann-Whitney statistics. /// @return the area under the ROC curve. If the area is less /// than 0.5 and absolute=true, 1-area is returned. Complexity is /// \f$N\log N \f$ where \f$N \f$ is number of samples. /// double score(const classifier::Target& target, const utility::vector& value); /** Function taking values, target, weight and a vector defining what samples to use. The area is defines as \f$\frac{\sum w^+w^-}{\sum w^+w^-}\f$, where the sum in the numerator goes over all pairs where value+ is larger than value-. The denominator goes over all pairs. If target is equal to 1, sample belonges to class + otherwise sample belongs to class -. @return wheighted version of area under the ROC curve. If the area is less than 0.5 and absolute=true, 1-area is returned. Complexity is \f$N^2 \f$ where \f$N \f$ is number of samples. /** @brief Two-sided p-value. @return min(2*p_value_one_sided, 2-2*p_value_one_sided) */ double score(const classifier::Target& target, const classifier::DataLookupWeighted1D& value); double p_value(void) const; /** Function taking values, target, weight and a vector defining what samples to use. The area is defines as \f$\frac{\sum w^+w^-}{\sum w^+w^-}\f$, where the sum in the numerator goes over all pairs where value+ is larger than value-. The denominator goes over all pairs. If target is equal to 1, sample belonges to class + otherwise sample belongs to class -. @return wheighted version of area under the ROC curve. If the area is less than 0.5 and absolute=true, 1-area is returned. Complexity is \f$N^2 \f$ where \f$N \f$ is number of samples. /** @brief Set everything to zero */ double score(const classifier::Target& target, const utility::vector& value, const utility::vector& weight); /// /// Function returning true if target is positive (binary()) for /// the sample with ith lowest data value, so i=0 corresponds to /// the sample with the lowest data value and i=n()-1 the sample /// with highest data value. /// bool target(const size_t i) const; void reset(void); private: /// Implemented as in MatLab 13.1 double get_p_approx(const double) const; double get_p_approx(double) const; /// Implemented as in MatLab 13.1 double area_; u_int minimum_size_; u_int nof_pos_; std::vector > vec_pair_; // class-value-pair bool weighted_; double w_neg_; double w_pos_; // > std::multimap > multimap_; }; /// /// The output operator for the ROC class. The output is an Nx2 /// matrix, where the first column is the sensitivity and second /// is the specificity. /// std::ostream& operator<< (std::ostream& s, const ROC&); }}} // of namespace statistics, yat, and theplu
Note: See TracChangeset for help on using the changeset viewer.