source: branches/0.4-stable/yat/statistics/Fisher.h @ 1624

Last change on this file since 1624 was 1624, checked in by Peter, 15 years ago

fixes #461. Also modified implementation of cdf_hypergeometric_P, which may cause conflict with modifications done in trunk (refs #87). If so, go with the trunk version (which uses GSL 1.8).

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 5.3 KB
Line 
1#ifndef _theplu_yat_statistics_fisher_
2#define _theplu_yat_statistics_fisher_
3
4// $Id: Fisher.h 1624 2008-11-12 22:10:52Z peter $
5
6/*
7  Copyright (C) 2004, 2005 Peter Johansson
8  Copyright (C) 2006, 2007 Jari Häkkinen, Peter Johansson
9  Copyright (C) 2008 Peter Johansson
10
11  This file is part of the yat library, http://dev.thep.lu.se/yat
12
13  The yat library is free software; you can redistribute it and/or
14  modify it under the terms of the GNU General Public License as
15  published by the Free Software Foundation; either version 2 of the
16  License, or (at your option) any later version.
17
18  The yat library is distributed in the hope that it will be useful,
19  but WITHOUT ANY WARRANTY; without even the implied warranty of
20  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21  General Public License for more details.
22
23  You should have received a copy of the GNU General Public License
24  along with this program; if not, write to the Free Software
25  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
26  02111-1307, USA.
27*/
28
29#include "Score.h"
30
31#include <cmath>
32
33namespace theplu {
34namespace yat {
35namespace utility {
36  class vector;
37}
38namespace statistics { 
39  /**
40     @brief Fisher's exact test.   
41
42     Fisher's Exact test is a procedure that you can use for data
43     in a two by two contingency table: \f[ \begin{tabular}{|c|c|}
44     \hline a&b \tabularnewline \hline c&d \tabularnewline \hline
45     \end{tabular} \f] Fisher's Exact Test is based on exact
46     probabilities from a specific distribution (the hypergeometric
47     distribution). There's really no lower bound on the amount of
48     data that is needed for Fisher's Exact Test. You do have to
49     have at least one data value in each row and one data value in
50     each column. If an entire row or column is zero, then you
51     don't really have a 2 by 2 table. But you can use Fisher's
52     Exact Test when one of the cells in your table has a zero in
53     it. Fisher's Exact Test is also very useful for highly
54     imbalanced tables. If one or two of the cells in a two by two
55     table have numbers in the thousands and one or two of the
56     other cells have numbers less than 5, you can still use
57     Fisher's Exact Test. For very large tables (where all four
58     entries in the two by two table are large), your computer may
59     take too much time to compute Fisher's Exact Test. In these
60     situations, though, you might as well use the Chi-square test
61     because a large sample approximation (that the Chi-square test
62     relies on) is very reasonable. If all elements are larger than
63     10 a Chi-square test is reasonable to use.
64     
65     @note The statistic assumes that each column and row sum,
66     respectively, is fixed. Just because you have a 2x2 table, this
67     assumption does not necessarily match your experimental setup. See
68     e.g. Barnard's test for an alternative.
69  */
70 
71  class Fisher
72  {
73 
74  public:
75    ///
76    /// Default constructor.
77    ///
78    Fisher(void);
79
80    ///
81    /// Destructor (virtual).
82    ///
83    virtual ~Fisher(void);
84         
85   
86    /**
87       The Chi2 score is calculated as \f$ \sum
88       \frac{(O_i-E_i)^2}{E_i}\f$ where \a E is the expected value and
89       \a O is the observed value.
90
91       \return Chi2 score
92    */
93    double Chi2(void) const;
94
95    /**
96       Calculates the expected cell values under the null hypothesis:
97       \f$ a' = \frac{(a+c)(a+b)}{a+b+c+d} \f$,
98       \f$ b' = \frac{(a+b)(b+d)}{a+b+c+d} \f$,
99       \f$ c' = \frac{(a+c)(c+d)}{a+b+c+d} \f$,
100       \f$ d' = \frac{(b+d)(c+d)}{a+b+c+d} \f$.
101    */
102    void expected(double& a, double& b, double& c, double& d) const;
103
104    ///
105    /// If all elements in the table are at least minimum_size(), a
106    /// Chi2 approximation is used for the p-value calculation.
107    ///
108    /// @return modifiable reference to minimum_size
109    ///
110    unsigned int& minimum_size(void);
111
112    ///
113    /// If all elements in the table are at least minimum_size(), a
114    /// Chi2 approximation is used for the p-value calculation.
115    ///
116    /// @return const reference to minimum_size
117    ///
118    const unsigned int& minimum_size(void) const;
119
120    /**
121       If all elements in the table are at least minimum_size(), a Chi2
122       approximation is used.
123       
124       Otherwise a two-sided p-value is calculated using the
125       hypergeometric distribution
126       \f$ \sum_k P(k) \f$ where summation runs over \a k such that
127       \f$ |k-\langle a \rangle| \ge |a-\langle a \rangle| \f$.
128
129       \return two-sided p-value
130    */
131    double p_value() const;
132   
133    ///
134    /// One-sided p-value is the probability of a larger (or equal) odds ratio.
135    ///
136    /// If all elements in the table are at least minimum_size(), a Chi2
137    /// approximation is used.
138    ///
139    /// @return One-sided p-value
140    ///
141    double p_value_one_sided() const;
142   
143    /**
144       Calculates the odds ratio from the 2x2 table
145       \f[ \begin{tabular}{|c|c|}
146       \hline a&b \tabularnewline \hline c&d \tabularnewline \hline
147       \end{tabular} \f] as \f$ \frac{ad}{bc} \f$
148
149       Object will remember the values of \a a, \a b, \a c, and \a d.
150
151       @return odds ratio.
152
153       @throw runtime_error If table is invalid. A table
154       is invalid if a row or column sum is zero.
155    */
156    double oddsratio(const unsigned int a, const unsigned int b, 
157                     const unsigned int c, const unsigned int d);
158
159  private:
160    bool calculate_p_exact() const; // NOTE(review): presumably true when the exact test (not Chi2) applies - confirm in Fisher.cc
161
162    // two-sided
163    double p_value_approximative(void) const;
164    // two-sided
165    double p_value_exact(void) const;
166
167    unsigned int a_; // 2x2 table elements; presumably the values passed to oddsratio() - confirm in Fisher.cc
168    unsigned int b_;
169    unsigned int c_;
170    unsigned int d_;
171    unsigned int minimum_size_; // presumably the value exposed by minimum_size() - confirm in Fisher.cc
172    double oddsratio_; // presumably the odds ratio last returned by oddsratio() - confirm in Fisher.cc
173  };
174
175}}} // of namespace statistics, yat, and theplu
176
177#endif
Note: See TracBrowser for help on using the repository browser.