source: trunk/lib/statistics/Fisher.h @ 449

Last change on this file since 449 was 449, checked in by Peter, 16 years ago

added approximative p-value calculation using Chi2, fixed some bugs, and extended the interface slightly

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 4.9 KB
Line 
1// $Id: Fisher.h 449 2005-12-15 20:06:10Z peter $
2
3#ifndef _theplu_statistics_fisher_
4#define _theplu_statistics_fisher_
5
#include <c++_tools/statistics/Score.h>
#include <c++_tools/gslapi/vector.h>

#include <cmath>
#include <vector>
10
11namespace theplu {
12namespace statistics { 
13  ///
14  /// @brief Fisher's exact test.   
15  /// Fisher's Exact test is a procedure that you can use for data
16  /// in a two by two contingency table: \f[ \begin{tabular}{|c|c|}
17  /// \hline a&b \tabularnewline \hline c&d \tabularnewline \hline
18  /// \end{tabular} \f] Fisher's Exact Test is based on exact
19  /// probabilities from a specific distribution (the hypergeometric
20  /// distribution). There's really no lower bound on the amount of
21  /// data that is needed for Fisher's Exact Test. You do have to
22  /// have at least one data value in each row and one data value in
23  /// each column. If an entire row or column is zero, then you
24  /// don't really have a 2 by 2 table. But you can use Fisher's
25  /// Exact Test when one of the cells in your table has a zero in
26  /// it. Fisher's Exact Test is also very useful for highly
27  /// imbalanced tables. If one or two of the cells in a two by two
28  /// table have numbers in the thousands and one or two of the
29  /// other cells has numbers less than 5, you can still use
30  /// Fisher's Exact Test. For very large tables (where all four
31  /// entries in the two by two table are large), your computer may
32  /// take too much time to compute Fisher's Exact Test. In these
33  /// situations, though, you might as well use the Chi-square test
34  /// because a large sample approximation (that the Chi-square test
35  /// relies on) is very reasonable. If all elements are larger than
36  /// 10 a Chi-square test is reasonable to use.
37  ///
38  /// @note The statistica assumes that each column and row sum,
39  /// respectively, are fixed. Just because you have a 2x2 table, this
40  /// assumtion does not necessarily match you experimental upset. See
41  /// e.g. Barnard's test for alternative.
42  ///
43 
44  class Fisher : public Score
45  {
46 
47  public:
48    ///
49    /// Default Constructor.
50    ///
51    Fisher(bool absolute=true);
52
53    ///
54    /// Destructor
55    ///
56    virtual ~Fisher(void) {};
57         
58   
59    ///
60    /// @return Chi2 score
61    ///
62    double Chi2(void) const;
63
64    ///
65    /// Cutoff sets the limit whether a value should go into the left
66    /// or the right column. @see score
67    ///
68    /// @return reference to cutoff for column
69    ///
70    inline double& cutoff_column(void) { return cutoff_column_; }
71
72    ///
73    /// Cutoff sets the limit whether a value should go into the left
74    /// or the right row. @see score
75    ///
76    /// @return reference to cutoff for row
77    ///
78    inline double& cutoff_row(void) { return cutoff_row_; }
79
80    ///
81    /// Calculates the expected values under the null hypothesis.
82    /// a' = (a+c)(a+b)/(a+b+c+d)
83    ///
84    void expected(double& a, double& b, double& c, double& d) const;
85
86    ///
87    /// minimum_size is the threshold for when the p-value calculation
88    /// is performed using a Chi2 approximation.
89    ///
90    /// @return reference to minimum_size
91    ///
92    inline u_int& minimum_size(void){ return minimum_size_; } 
93
94    ///
95    /// If absolute, the p-value is the two-sided p-value. If all
96    /// elements in table is at least minimum_size, a Chi2
97    /// approximation is used.
98    ///
99    /// @return p-value
100    ///
101    double p_value() const;
102   
103    ///
104    /// Function calculating score from 2x2 table for which the
105    /// elements are calculated as follows \n
106    /// a: #data \f$ x=1 \f$ AND \f$ y=1 \f$ \n
107    /// b: #data \f$ x=-1 \f$ AND \f$ y=1 \f$ \n
108    /// c: #data \f$ x=1 \f$ AND \f$ y=-1 \f$ \n
109    /// d: #data \f$ x=-1 \f$ AND \f$ y=1 \f$ \n
110    ///
111    /// @return odds ratio. If absolute_ is true and odds ratio is
112    /// less than unity 1 divided by odds ratio is returned
113    ///
114    double score(const gslapi::vector& x, const gslapi::vector& y, 
115                 const std::vector<size_t>& = std::vector<size_t>());
116
117    ///
118    /// Weighted version of score. Each element in 2x2 table is
119    /// calculated as \f$ \sum w_i \f$, so when each weight is
120    /// unitary the same table is created as in the unweighted version
121    ///
122    /// @return odds ratio
123    ///
124    /// @note
125    ///
126    double score(const gslapi::vector& x, const gslapi::vector& y, 
127                 const gslapi::vector& w,
128                 const std::vector<size_t>& = std::vector<size_t>());
129
130    ///
131    /// \f$ \frac{ad}{bc} \f$
132    ///
133    /// @return odds ratio. If absolute_ is true and odds ratio is
134    /// less than unity, 1 divided by odds ratio is returned
135    ///
136    double score(const u_int a, const u_int b, 
137                 const u_int c, const u_int d); 
138   
139
140         
141  private:
142    double oddsratio(const double a, const double b, 
143                     const double c, const double d);
144
145    // two-sided
146    double p_value_approximative(void) const;
147    //two-sided
148    double p_value_exact(void) const;
149
150    std::vector<size_t> train_set_;
151    gslapi::vector weight_;
152    u_int a_;
153    u_int b_;
154    u_int c_;
155    u_int d_;
156    double cutoff_column_;
157    double cutoff_row_;
158    u_int minimum_size_;
159    double oddsratio_;
160  };
161
162}} // of namespace statistics and namespace theplu
163
164#endif
165
Note: See TracBrowser for help on using the repository browser.