Changeset 777 for trunk/yat/statistics


Ignore:
Timestamp:
Mar 4, 2007, 1:34:17 PM (15 years ago)
Author:
Peter
Message:

Changed Fisher interface dramatically. No longer inherited from Score. Removed several functions since they encourage inappropriate usage. Removed support for weights. refs #101

Location:
trunk/yat/statistics
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/yat/statistics/Fisher.cc

    r760 r777  
    2323
    2424#include "Fisher.h"
    25 #include "Score.h"
    2625#include "utility.h"
    27 #include "yat/classifier/DataLookupWeighted1D.h"
    28 #include "yat/classifier/Target.h"
    2926
    3027#include <gsl/gsl_cdf.h>
     
    3633
    3734 
    38   Fisher::Fisher(bool b)
    39     : Score(b), a_(0), b_(0), c_(0), d_(0), oddsratio_(1.0), value_cutoff_(0)
     35  Fisher::Fisher()
     36    : a_(0), b_(0), c_(0), d_(0), oddsratio_(1.0)
    4037  {
    4138  }
    4239
     40
     41  Fisher::~Fisher()
     42  {
     43  }
     44
     45
     46  bool Fisher::calculate_p_exact(void) const
     47  {
     48    return ( a_<minimum_size_ || b_<minimum_size_ ||
     49             c_<minimum_size_ || d_<minimum_size_);
     50  }
    4351
    4452  double Fisher::Chi2() const
     
    6775
    6876
    69   double Fisher::oddsratio(const double a,
    70                            const double b,
    71                            const double c,
    72                            const double d)
     77  const u_int& Fisher::minimum_size(void) const
     78  {
     79    return minimum_size_;
     80  }
     81
     82
     83  double Fisher::oddsratio(const u_int a,
     84                           const u_int b,
     85                           const u_int c,
     86                           const u_int d)
    7387  {
    7488    // If a column sum or a row sum is zero, the table is nonsense
     
    7892
    7993    oddsratio_=(a*d)/(b*d);
    80     if (absolute_ && oddsratio_<1)
    81       return 1/oddsratio_;
    8294    return oddsratio_;
    8395  }
     
    8698  double Fisher::p_value() const
    8799  {
    88     double p=1;
    89     if ( ( a_<minimum_size_ || b_<minimum_size_ ||
    90            c_<minimum_size_ || d_<minimum_size_) && !weighted_ ){
     100    if (oddsratio_>1.0)
     101      return 2*p_value_one_sided();
     102    if (oddsratio_<1.0){
     103      // If oddsratio is less than unity
     104      // Two-sided p-value is
     105      // P(X <= oddsratio) + P(X >= 1/oddsratio) =
     106      // P(X <= oddsratio) + P(X <= oddsratio)
     107      // 2 * (P(X < oddsratio) + P(X = oddsratio))
     108      // 2 * (1 - P(X >= oddsratio) + P(X = oddsratio))
     109      if (calculate_p_exact())
     110        return 2*(1-p_value_one_sided()+
     111                  gsl_ran_hypergeometric_pdf(a_,a_+b_,c_+d_, a_+c_)
     112                  );
     113      // if p is calculated approximatively correction is not needed
     114      return 2*(1-p_value_one_sided());
    91115
    92       p=p_value_exact();
    93116    }
    94     else
    95       p=p_value_approximative();
     117    return 1.0;
     118  }
    96119
    97     if (!absolute_){
    98       p=p/2;
    99       if (oddsratio_<0.5){
    100         // last term because >= not equal to !(<=)
    101         u_int a = static_cast<u_int>(a_);
    102         u_int b = static_cast<u_int>(b_);
    103         u_int c = static_cast<u_int>(c_);
    104         u_int d = static_cast<u_int>(d_);
    105         return 1-p+gsl_ran_hypergeometric_pdf(a, a+b, c+d, a+c);
    106       }
    107     }
    108     return p;
     120
     121  double Fisher::p_value_one_sided() const
     122  {
     123    if ( calculate_p_exact() )
     124      return p_value_exact();
     125    return p_value_approximative();
    109126  }
    110127
     
    117134  double Fisher::p_value_exact() const
    118135  {
    119     u_int a = static_cast<u_int>(a_);
    120     u_int b = static_cast<u_int>(b_);
    121     u_int c = static_cast<u_int>(c_);
    122     u_int d = static_cast<u_int>(d_);
    123    
    124136    // Since the calculation is symmetric and cdf_hypergeometric_P
    125137    // loops up to k we choose the smallest number to be k and mirror
    126138    // the matrix. This choice makes the p-value two-sided.
    127139
    128     if (a<b && a<c && a<d)
    129       return statistics::cdf_hypergeometric_P(a,a+b,c+d,a+c);
    130     else if (b<a && b<c && b<d)
    131       return  statistics::cdf_hypergeometric_P(b,a+b,c+d,b+d);
    132     else if (c<a && c<b && c<d)
    133       return  statistics::cdf_hypergeometric_P(c,c+d,a+b,a+c);
     140    if (a_<b_ && a_<c_ && a_<d_)
     141      return statistics::cdf_hypergeometric_P(a_,a_+b_,c_+d_,a_+c_);
     142    else if (b_<a_ && b_<c_ && b_<d_)
     143      return  statistics::cdf_hypergeometric_P(b_,a_+b_,c_+d_,b_+d_);
     144    else if (c_<a_ && c_<b_ && c_<d_)
     145      return  statistics::cdf_hypergeometric_P(c_,c_+d_,a_+b_,a_+c_);
    134146
    135     return statistics::cdf_hypergeometric_P(d,c+d,a+b,b+d);
     147    return statistics::cdf_hypergeometric_P(d_,c_+d_,a_+b_,b_+d_);
    136148
    137149  }
    138150
    139151
    140   double Fisher::score(const classifier::Target& target,
    141                        const utility::vector& value)
    142   {
    143     weighted_=false;
    144     a_=b_=c_=d_=0;
    145     for (size_t i=0; i<target.size(); i++)
    146       if (target.binary(i))
    147         if (value(i)>value_cutoff_)
    148           a_++;
    149         else
    150           c_++;
    151       else
    152         if (value(i)>value_cutoff_)
    153           b_++;
    154         else
    155           d_++;
    156        
    157     return oddsratio(a_,b_,c_,d_);
    158   }
    159  
    160   double Fisher::score(const classifier::Target& target,
    161                        const classifier::DataLookupWeighted1D& value)
    162   {
    163     weighted_=true;
    164     a_=b_=c_=d_=0;
    165     for (size_t i=0; i<target.size(); i++)
    166       if (target.binary(i))
    167         if (value.data(i)>value_cutoff_)
    168           a_+=value.weight(i);
    169         else
    170           c_+=value.weight(i);
    171       else
    172         if (value.data(i)>value_cutoff_)
    173           b_+=value.weight(i);
    174         else
    175           d_+=value.weight(i);
    176        
    177     return oddsratio(a_,b_,c_,d_);
    178   }
    179  
    180   double Fisher::score(const classifier::Target& target,
    181                        const utility::vector& value,
    182                        const utility::vector& weight)
    183   {
    184     weighted_=true;
    185     a_=b_=c_=d_=0;
    186     for (size_t i=0; i<target.size(); i++)
    187       if (target.binary(i))
    188         if (value(i)>value_cutoff_)
    189           a_+=weight(i);
    190         else
    191           c_+=weight(i);
    192       else
    193         if (value(i)>value_cutoff_)
    194           b_+=weight(i);
    195         else
    196           d_+=weight(i);
    197        
    198     return oddsratio(a_,b_,c_,d_);
    199   }
    200  
    201   double Fisher::score(const u_int a, const u_int b,
    202                        const u_int c, const u_int d)
    203   {
    204     a_=a;
    205     b_=b;
    206     c_=c;
    207     d_=d;
    208     return oddsratio(a,b,c,d);
    209   }
    210 
    211 
    212   double& Fisher::value_cutoff(void)
    213   {
    214     return value_cutoff_;
    215   }
    216 
    217152}}} // of namespace statistics, yat, and theplu
  • trunk/yat/statistics/Fisher.h

    r760 r777  
    6969  */
    7070 
    71   class Fisher : public Score
     71  class Fisher
    7272  {
    7373 
     
    7676    /// Default Constructor.
    7777    ///
    78     Fisher(bool absolute=true);
     78    Fisher(void);
    7979
    8080    ///
    8181    /// Destructor
    8282    ///
    83     virtual ~Fisher(void) {};
     83    virtual ~Fisher(void);
    8484         
    8585   
     
    8989    double Chi2(void) const;
    9090
    91     ///
    92     /// Calculates the expected values under the null hypothesis.
    93     /// a' = (a+c)(a+b)/(a+b+c+d)
    94     ///
     91    /**
     92       Calculates the expected values under the null hypothesis.
     93       \f$ a' = \frac{(a+c)(a+b)}{a+b+c+d} \f$,
     94       \f$ b' = \frac{(a+b)(b+d)}{a+b+c+d} \f$,
     95       \f$ c' = \frac{(a+c)(c+d)}{a+b+c+d} \f$,
     96       \f$ d' = \frac{(b+d)(c+d)}{a+b+c+d} \f$,
     97    */
    9598    void expected(double& a, double& b, double& c, double& d) const;
    9699
    97100    ///
    98     /// minimum_size is the threshold for when the p-value calculation
    99     /// is performed using a Chi2 approximation.
     101    /// If all elements in table is at least minimum_size(), a Chi2
     102    /// approximation is used for p-value calculation.
    100103    ///
    101104    /// @return reference to minimum_size
     
    104107
    105108    ///
    106     /// If absolute, the p-value is the two-sided p-value. If all
    107     /// elements in table is at least minimum_size, a Chi2
     109    /// If all elements in table is at least minimum_size(), a Chi2
     110    /// approximation is used for p-value calculation.
     111    ///
     112    /// @return const reference to minimum_size
     113    ///
     114    const u_int& minimum_size(void) const;
     115
     116    ///
     117    /// If oddsratio is larger than unity, two-sided p-value is equal
     118    /// to 2*p_value_one_sided(). If oddsratio is smaller than unity
     119    /// two-sided p-value is equal to 2*(1-p_value_one_sided()). If
     120    /// oddsratio is unity two-sided p-value is equal to unity.
     121    ///
     122    /// If all elements in table is at least minimum_size(), a Chi2
    108123    /// approximation is used.
    109124    ///
    110     /// @return p-value
    111     ///
    112     /// @note in weighted case, approximation Chi2 is always used.
     125    /// @return 2-sided p-value
    113126    ///
    114127    double p_value() const;
    115128   
    116129    ///
    117     /// Function calculating score from 2x2 table for which the
    118     /// elements are calculated as follows \n
    119     /// target.binary(i) sample i in group a or c otherwise in b or d
    120     /// \f$ value(i) > \f$ value_cutoff() sample i in group a or b
    121     /// otherwise c or d\n
     130    /// One-sided p-value is probability to get larger (or equal) oddsratio.
    122131    ///
    123     /// @return odds ratio. If absolute_ is true and odds ratio is
    124     /// less than unity 1 divided by odds ratio is returned
     132    /// If all elements in table is at least minimum_size(), a Chi2
     133    /// approximation is used.
    125134    ///
    126     /// @throw If table is invalid a runtime_error is thrown.
     135    /// @return One-sided p-value
    127136    ///
    128     double score(const classifier::Target& target,
    129                  const utility::vector& value);
     137    double p_value_one_sided() const;
     138   
     139    /**
     140       Function calculating odds ratio from 2x2 table
     141       \f[ \begin{tabular}{|c|c|}
     142       \hline a&b \tabularnewline \hline c&d \tabularnewline \hline
     143       \end{tabular} \f] as \f$ \frac{ad}{bc} \f$
    130144
    131     ///
    132     /// Weighted version of score. Each element in 2x2 table is
    133     /// calculated as \f$ \sum w_i \f$, so when each weight is
    134     /// unitary the same table is created as in the unweighted version
    135     ///
    136     /// @return odds ratio
    137     ///
    138     /// @see score
    139     ///
    140     /// @throw If table is invalid a runtime_error is thrown.
    141     ///
    142     double score(const classifier::Target& target,
    143                  const classifier::DataLookupWeighted1D& value);
     145       @return odds ratio.
    144146
    145 
    146     ///
    147     /// Weighted version of score. Each element in 2x2 table is
    148     /// calculated as \f$ \sum w_i \f$, so when each weight is
    149     /// unitary the same table is created as in the unweighted version
    150     ///
    151     /// @return odds ratio
    152     ///
    153     /// @see score
    154     ///
    155     /// @throw If table is invalid a runtime_error is thrown.
    156     ///
    157     double score(const classifier::Target& target,
    158                  const utility::vector& value,
    159                  const utility::vector& weight);
    160 
    161     ///
    162     /// \f$ \frac{ad}{bc} \f$
    163     ///
    164     /// @return odds ratio. If absolute_ is true and odds ratio is
    165     /// less than unity, 1 divided by odds ratio is returned
    166     ///
    167     /// @throw If table is invalid a runtime_error is thrown.
    168     ///
    169     double score(const u_int a, const u_int b,
    170                  const u_int c, const u_int d);
    171 
    172     ///
    173     /// Cutoff sets the limit whether a value should go into the left
    174     /// or the right row. @see score
    175     ///
    176     /// @return reference to cutoff for row
    177     ///
    178     double& value_cutoff(void);
     147       @throw If table is invalid a runtime_error is thrown. A table
     148       is invalid if a row or column sum is zero.
     149    */
     150    double oddsratio(const u_int a, const u_int b,
     151                     const u_int c, const u_int d);
    179152
    180153  private:
    181     double oddsratio(const double a, const double b,
    182                      const double c, const double d);
     154    bool calculate_p_exact() const;
    183155
    184156    // two-sided
     
    187159    double p_value_exact(void) const;
    188160
    189     double a_;
    190     double b_;
    191     double c_;
    192     double d_;
     161    u_int a_;
     162    u_int b_;
     163    u_int c_;
     164    u_int d_;
    193165    u_int minimum_size_;
    194166    double oddsratio_;
    195     double value_cutoff_;
    196167  };
    197168
Note: See TracChangeset for help on using the changeset viewer.