Changeset 112


Ignore:
Timestamp:
Jul 7, 2004, 12:23:44 PM (18 years ago)
Author:
Peter
Message:

Added the option to use only the data points listed in train_set instead of all data points.

Location:
trunk/src
Files:
5 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/ROC.cc

    r110 r112  
    44#include <iostream>
    55//#include <algorithm>
    6 //#include <utility>
    7 //#include <vector>
     6#include <utility>
     7#include <vector>
     8#include <cmath>
    89
    910// Thep C++ Tools
     
    1516namespace cpptools { 
    1617
    17   ROC::ROC(const gslapi::vector& target, const gslapi::vector& value)
     18  ROC::ROC(const gslapi::vector& target, const gslapi::vector& data,
     19           const std::vector<size_t>& train_set)
    1820 
    19     : Score(), value_(), nof_pos_(0), minimum_size_(10), area_(-1)
     21    : Score(), area_(-1), data_(data), minimum_size_(10), nof_pos_(0),
     22      target_(target), train_set_(train_set),
     23      value_(std::vector<std::pair<double, double> >())
    2024   
    2125  {
    22     sort(target, value);
     26    if (!train_set_.size())
     27      for (size_t i=0; i<target_.size(); i++)
     28        train_set_.push_back(i); 
     29    sort();
    2330  }
    2431
    2532  ROC::ROC()
    26     : Score(), value_(), nof_pos_(0), minimum_size_(10), area_(-1)
     33    : Score(), area_(-1), data_(), minimum_size_(10), nof_pos_(0), target_(),
     34      train_set_(std::vector<size_t>()),
     35      value_(std::vector<std::pair<double, double> >())
    2736       
    2837  {
     
    3847      x += 0.5/nof_pos_/(value_.size()-nof_pos_);
    3948
    40     double sigma = (std::sqrt((value_.size()-nof_pos_)* nof_pos_ *
    41                         (value_.size()+1)/12) /
    42                     (value_.size() - nof_pos_ ) / nof_pos_);
     49    double sigma = (std::sqrt( (value_.size()-nof_pos_)*nof_pos_*
     50                               (value_.size()+1.0)/12 ) /
     51                    ( value_.size() - nof_pos_ ) / nof_pos_ );
    4352    double p = gsl_cdf_gaussian_Q(x, sigma);
    4453       
     
    7887  {
    7988    if (area_==-1){
    80       double area_=0;
     89      double area_ = 0;
    8190      for (unsigned int i=0; i<value_.size(); i++)
    8291        if (value_[i].first==1)
     
    8897  }
    8998
    90   double ROC::score(const gslapi::vector& target, const gslapi::vector& value)
     99  double ROC::score(const gslapi::vector& target, const gslapi::vector& data,
     100                    const std::vector<size_t>& train_set)
    91101  {
    92     sort(target, value);
    93     double area_=0;
    94     for (unsigned int i=0; i<value_.size(); i++)
    95       if (value_[i].first==1)
    96         area_+=i;
    97     // Normalizing the area to 0-1
    98     area_ = (area_/nof_pos_ - (nof_pos_ - 1)/2 )/(value_.size() - nof_pos_);
    99 
     102    target_ = target;
     103    data_ = data;
     104    if (!train_set.size()){
     105      train_set_.resize(0);
     106      for (size_t i=0; i<target_.size(); i++)
     107        train_set_.push_back(i); 
     108    }
     109    else
     110      train_set_ = train_set;
     111    sort();
     112    area_ = score();   
    100113    return area_;
    101114  }
    102115
    103   void ROC::sort(const gslapi::vector& target, const gslapi::vector& value)
     116  void ROC::sort()
    104117  {
    105     for (unsigned int i=0; i<target.size(); i++){
    106       int targ=static_cast<int>(target(i));
    107       std::pair<int, double> tmp(targ, value(i));
     118    value_.resize(0);
     119    for (unsigned int i=0; i<train_set_.size(); i++){
     120      std::pair<double, double> tmp(target_(train_set_[i]), data_(train_set_[i]));
    108121      value_.push_back(tmp);
    109       if (targ==1)
     122      if (target_(train_set_[i])==1)
    110123        nof_pos_++;
    111124    }
    112125    std::sort(value_.begin(),value_.end(),
    113          pair_value_compare<int,double>());
     126         pair_value_compare<double, double>());
    114127  }
    115128
  • trunk/src/ROC.h

    r103 r112  
    2828    /// Default constructor
    2929    ///
    30     ROC(void);
     30    ROC();
    3131         
    3232    ///
    33     /// Constructor taking a value vector and a target vector (+1 or -1).
     33    /// Constructor taking a value vector, a target vector (+1 or -1)
     34    /// and a vector defining what samples to use.
    3435    ///
    35     ROC(const gslapi::vector&, const gslapi::vector&);
     36    ROC(const gslapi::vector&, const gslapi::vector&,
     37        const std::vector<size_t>& = std::vector<size_t>());
    3638         
    3739    ///
     
    5052    /// the ROC curve
    5153    ///
    52     double score(const gslapi::vector&, const gslapi::vector&);
     54    double score(const gslapi::vector&, const gslapi::vector&, 
     55                 const std::vector<size_t>& = std::vector<size_t>());
    5356       
    5457    ///
     
    6669         
    6770    ///
    68     /// @ return a vector of outputs that is sorted with respect to
    69     /// the corresponding score value
     71    /// @return the targets in train_set sorted with respect to the
     72    /// corresponding data
    7073    ///
    7174    gslapi::vector ROC::target(void) const;
     
    7578    /// approximation is used for the p-value calculation.
    7679    ///
    77     inline void minimum_size(const u_int minimum_size) {minimum_size_ = minimum_size; } 
     80    inline void minimum_size(const u_int minimum_size)
     81    {minimum_size_ = minimum_size; } 
    7882
    7983  private:
    80     std::vector<std::pair<int, double> > value_; //sorted pair of id and value
    81     double nof_pos_;
     84    double area_;
     85    gslapi::vector data_;
    8286    u_int minimum_size_;
    83     double area_;
     87    u_int nof_pos_;
     88    gslapi::vector target_;
     89    std::vector<size_t> train_set_;
     90    /// pair of target and data. should always be sorted with respect to
     91    /// data.
     92    std::vector<std::pair<double, double> > value_;
     93   
    8494    ///
    8595    ///
     
    96106 
    97107    ///
    98     /// sorting
     108    /// sorting value_, should always be done when changing train_set_
    99109    ///
    100     void ROC::sort(const gslapi::vector&, const gslapi::vector&);
     110    void ROC::sort();
    101111       
    102112  };
  • trunk/src/Score.h

    r102 r112  
    2727    virtual ~Score(void) {};
    2828   
    29     virtual double score(const gslapi::vector&,
    30                          const gslapi::vector&) = 0;
     29    virtual double
     30    score(const gslapi::vector&,
     31          const gslapi::vector&,
     32          const std::vector<size_t>& = std::vector<size_t>()) = 0;
    3133 
    3234    virtual double p_value() = 0;
  • trunk/src/tScore.cc

    r102 r112  
    1313
    1414  tScore::tScore()
    15       : Score(), value_(), target_()
     15    : Score(), value_(), target_(), train_set_()
    1616  {
    1717  }
    1818
    1919  tScore::tScore( const gslapi::vector& target,
    20                   const gslapi::vector& value)
     20                  const gslapi::vector& value,
     21                  const std::vector<size_t>& train_set)
    2122      : Score(), value_(value), target_(target)
    2223  {
     24    if (!train_set_.size())
     25      for (size_t i=0; i<target_.size(); i++)
     26        train_set_.push_back(i); 
    2327  }
    2428   
     
    2731    Averager positive;
    2832    Averager negative;
    29     for(size_t i=0; i<target_.size(); i++){
    30       if (target_[i]==1)
    31         positive.add(value_[i]);
     33    for(size_t i=0; i<train_set_.size(); i++){
     34      if (target_[train_set_[i]]==1)
     35        positive.add(value_[train_set_[i]]);
    3236      else
    33         negative.add(value_[i]);
     37        negative.add(value_[train_set_[i]]);
    3438    }
    3539    double diff = positive.mean() - negative.mean();
     
    3943  }
    4044
     45  double tScore::score(const gslapi::vector& target,
     46                       const gslapi::vector& value,
     47                       const std::vector<size_t>& train_set)
     48  {
     49    train_set_=train_set;
     50    if (!train_set_.size())
     51      for (size_t i=0; i<target_.size(); i++)
     52        train_set_.push_back(i); 
     53    target_ = target;
     54    value_ = value;
     55    return score();
     56  }
    4157
    4258  double tScore::p_value(void)
  • trunk/src/tScore.h

    r102 r112  
    3333    /// Constructor taking a value vector and a target vector (+1 or -1).
    3434    ///
    35     tScore(const gslapi::vector&, const gslapi::vector&);
     35    tScore(const gslapi::vector&, const gslapi::vector&,
     36           const std::vector<size_t>& = std::vector<size_t>());
    3637         
    3738    ///
     
    4950    double score();
    5051       
     52    ///
     53    /// Calculates the t-score, i.e. the ratio between difference in
     54    /// mean and standard deviation of this difference.
     55    /// @return \f$ \frac{\frac{1}{n_x}\sum x_i - \frac{1}{n_y}\sum y_i}
     56    /// {\frac{\sum x_i^2 + \sum y_i^2}{n_x-1+n_y-1}} \f$
     57    ///
     58    double score(const gslapi::vector&, const gslapi::vector&, 
     59                 const std::vector<size_t>& = std::vector<size_t>());
     60
    5161    ///
    5262    ///Calculates the p-value, i.e. the probability of observing a t-score
     
    6070    gslapi::vector value_;
    6171    gslapi::vector target_;
    62    
     72    std::vector<size_t> train_set_;
    6373       
    6474  };
Note: See TracChangeset for help on using the changeset viewer.