Changeset 186


Ignore:
Timestamp:
Oct 7, 2004, 8:51:13 PM (18 years ago)
Author:
Peter
Message:

Moved Fisher's exact test out of Statistics.cc into its own class, which inherits from the abstract Score base class

Location:
trunk/src
Files:
2 added
11 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/Makefile.am

    r183 r186  
    1010  Alignment.cc Averager.cc AveragerPair.cc ConsensusInputRanker.cc \
    1111  CrossValidation.cc \
    12   FileIO.cc GaussianKernelFunction.cc histogram.cc InputRanker.cc \
     12  Fisher.cc FileIO.cc GaussianKernelFunction.cc histogram.cc InputRanker.cc \
    1313  Kernel.cc kNNI.cc matrix.cc Merge.cc NNI.cc PCA.cc \
    1414  PolynomialKernelFunction.cc random_singleton.cc ROC.cc Score.cc \
     
    2323  Alignment.h Averager.h AveragerPair.h ConsensusInputRanker.h \
    2424  CrossValidation.h \
    25   FileIO.h GaussianKernelFunction.h histogram.h InputRanker.h Kernel.h \
     25  FileIO.h Fisher.h GaussianKernelFunction.h histogram.h InputRanker.h
     26  Kernel.h \
    2627  KernelFunction.h kNNI.h matrix.h Merge.h NNI.h PCA.h Pearson.h \
    2728  PolynomialKernelFunction.h random_singleton.h ROC.h Score.h \
  • trunk/src/Pearson.cc

    r181 r186  
    3030  double Pearson::p_value();
    3131  {
    32     if(weighted_){
    33       std::cerr << "Warning: p_value for " <<
    34                 << "weighted Pearson correlation not implemented\n";
     32    if(weighted_)
    3533      return 1;
    36     }
    3734    else if(nof_samples_<3){
    3835      std::cerr << "Warning: Only " << nof_samples_ << "samples. " <<
     
    5047                        const std::vector<size_t>& = std::vector<size_t>());
    5148  {
     49    weighted_=false;
    5250    if (!train_set.size()){
    5351      AveragerPair x(vec1,vec2);
     
    7472                        std::vector<size_t>());
    7573  {
     74    weighted_=true;
    7675    if (!train_set.size()){
    7776      gslapi::vector x = vec1;
     
    110109                        std::vector<size_t>());
    111110  {
     111    weighted_=true;
    112112    return = score(vec1, vec2, w1.mul_elements(w2),train_set);
    113    
    114113  }
    115114
  • trunk/src/Pearson.h

    r181 r186  
    9292    double r_;
    9393    int nof_samples_;
    94     bool weighted_;
     94
    9595
    9696    void centralize(vector, const vector);
  • trunk/src/ROC.cc

    r181 r186  
    1919  ROC::ROC(bool b)
    2020    : Score(b), area_(-1), minimum_size_(10), nof_pos_(0), 
    21       train_set_(std::vector<size_t>()),
    22       value_(std::vector<std::pair<double, double> >()),
    23       weight_(gslapi::vector())
     21      value_(std::vector<std::pair<double, double> >())
     22     
    2423       
    2524  {
     
    6160  double ROC::p_value(void)
    6261  {
    63     double p;
    64     if (nof_pos_ < minimum_size_ & value_.size()-nof_pos_ < minimum_size_)
    65       p = get_p_exact(area_*nof_pos_*(value_.size()-nof_pos_),
     62    if (!weighted_)
     63      return 1.0;
     64    else if (nof_pos_ < minimum_size_ & value_.size()-nof_pos_ < minimum_size_)
     65      return get_p_exact(area_*nof_pos_*(value_.size()-nof_pos_),
    6666                          nof_pos_, value_.size()-nof_pos_);
    6767    else
    68       p = get_p_approx(area_);
    69     return p;
     68      return get_p_approx(area_);
     69   
    7070  }
    7171
     
    7373                    const std::vector<size_t>& train_set)
    7474  {
     75    weighted_=false;
    7576    target_ = target;
    7677    data_ = data;
     
    103104                    const std::vector<size_t>& train_set)
    104105  {
     106    weighted_=true;
    105107    target_ = target;
    106108    data_ = data;
  • trunk/src/ROC.h

    r181 r186  
    9191    u_int minimum_size_;
    9292    u_int nof_pos_;
    93     std::vector<size_t> train_set_;
    94     std::vector<std::pair<double, double> > value_;
    9593    /// pair of target and data. should always be sorted with respect to
    9694    /// data.
    97     gslapi::vector weight_;
     95    std::vector<std::pair<double, double> > value_;
     96
    9897   
    9998    ///
  • trunk/src/SVM.cc

    r182 r186  
    3131    trained_(false),
    3232    train_set_(train_set),
    33     tolerance_(0.000000000001)
     33    tolerance_(0.000001)
    3434       
    3535  {
  • trunk/src/Score.cc

    r179 r186  
    77
    88  Score::Score(bool absolute)
    9     : absolute_(absolute)
     9    : absolute_(absolute), train_set_(std::vector<size_t>()), weighted_(true)
    1010  {
    1111  }
  • trunk/src/Score.h

    r179 r186  
    1010
    1111  ///
    12   ///Virtual Class calculating a score and the corresponding p-value.
     12  /// Score is an abstract class defining the interface for the score classes.
    1313  ///
    1414  class Score
     
    4141          const gslapi::vector&,
    4242          const std::vector<size_t>& = std::vector<size_t>()) = 0;
     43
     44    virtual double p_value() const = 0;
    4345 
    4446   
     
    4850    gslapi::vector data_;   
    4951    gslapi::vector target_;
    50 
     52    std::vector<size_t> train_set_;
     53    gslapi::vector weight_;
     54    bool weighted_;
    5155
    5256  }; // class Score
  • trunk/src/Statistics.cc

    r169 r186  
    2828      p+= gsl_ran_hypergeometric_pdf(i, n1, n2, t);
    2929    return p;
    30   }
    31 
    32   double fisher(u_int a, u_int b, u_int c, u_int d)
    33   {
    34     // Since the calculation is symmetric and cdf_hypergeometric_P
    35     // loops up to k we choose the smallest number to be k and mirror
    36     // the matrix.
    37     if (a<b && a<c && a<d)
    38       return cdf_hypergeometric_P(a,a+b,c+d,a+c);
    39     else if (b<a && b<c && b<d)
    40       return cdf_hypergeometric_P(b,a+b,c+d,b+d);
    41     else if (c<a && c<b && c<d)
    42       return cdf_hypergeometric_P(c,c+d,a+b,a+c);
    43     else
    44       return cdf_hypergeometric_P(d,c+d,a+b,b+d);
    4530  }
    4631
  • trunk/src/Statistics.h

    r171 r186  
    3838    double cdf_hypergeometric_P(u_int k, u_int n1, u_int n2, u_int t);
    3939
    40     ///
    41     /// Fisher's Exact test is a procedure that you can use for data
    42     /// in a two by two contingency table: \f[ \begin{tabular}{|c|c|}
    43     /// \hline a&b \tabularnewline \hline c&d \tabularnewline \hline
    44     /// \end{tabular} \f] Fisher's Exact Test is based on exact
    45     /// probabilities from a specific distribution (the hypergeometric
    46     /// distribution). There's really no lower bound on the amount of
    47     /// data that is needed for Fisher's Exact Test. You do have to
    48     /// have at least one data value in each row and one data value in
    49     /// each column. If an entire row or column is zero, then you
    50     /// don't really have a 2 by 2 table. But you can use Fisher's
    51     /// Exact Test when one of the cells in your table has a zero in
    52     /// it. Fisher's Exact Test is also very useful for highly
    53     /// imbalanced tables. If one or two of the cells in a two by two
    54     /// table have numbers in the thousands and one or two of the
    55     /// other cells has numbers less than 5, you can still use
    56     /// Fisher's Exact Test. For very large tables (where all four
    57     /// entries in the two by two table are large), your computer may
    58     /// take too much time to compute Fisher's Exact Test. In these
    59     /// situations, though, you might as well use the Chi-square test
    60     /// because a large sample approximation (that the Chi-square test
    61     /// relies on) is very reasonable. If all elements are larger than
    62     /// 10 a Chi-square test is reasonable to use. @return one-sided
    63     /// p-value for Fisher's exact test, i.e. the probability of getting an
    64     /// odds ratio \f$ ad/bc \f$ or more extreme, given row sums \f$ a+b \f$ and \f$
    65     /// c+d \f$ and column sums \f$ a+c \f$ and \f$ b+d \f$.
    66     ///
    67     double fisher(u_int a, u_int b, u_int c, u_int d); 
    6840
    6941    ///
  • trunk/src/tScore.cc

    r181 r186  
    2222                       const std::vector<size_t>& train_set)
    2323  {
     24    weighted_=false;
    2425    if (!train_set_.size())
    2526      for (size_t i=0; i<target_.size(); i++)
     
    4950
    5051  double tScore::score(const gslapi::vector& target,
    51                        const gslapi::vector& value,
     52                       const gslapi::vector& data,
    5253                       const gslapi::vector& weight,
    5354                       const std::vector<size_t>& train_set)
    5455  {
     56    weighted_=true;
    5557    if (!train_set_.size())
    5658      for (size_t i=0; i<target_.size(); i++)
     
    8284    double dof = target_.size()-2;
    8385    double p = gsl_cdf_tdist_Q(t_, dof);
    84     return dof > 0 ? p : 1;
     86    return (dof > 0 && !weighted_) ? p : 1;
    8587  }
    8688
Note: See TracChangeset for help on using the changeset viewer.