1 | #ifndef _theplu_yat_statistics_fisher_ |
---|
2 | #define _theplu_yat_statistics_fisher_ |
---|
3 | |
---|
4 | // $Id: Fisher.h 1624 2008-11-12 22:10:52Z peter $ |
---|
5 | |
---|
6 | /* |
---|
7 | Copyright (C) 2004, 2005 Peter Johansson |
---|
8 | Copyright (C) 2006, 2007 Jari Häkkinen, Peter Johansson |
---|
9 | Copyright (C) 2008 Peter Johansson |
---|
10 | |
---|
11 | This file is part of the yat library, http://dev.thep.lu.se/yat |
---|
12 | |
---|
13 | The yat library is free software; you can redistribute it and/or |
---|
14 | modify it under the terms of the GNU General Public License as |
---|
15 | published by the Free Software Foundation; either version 2 of the |
---|
16 | License, or (at your option) any later version. |
---|
17 | |
---|
18 | The yat library is distributed in the hope that it will be useful, |
---|
19 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
20 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
---|
21 | General Public License for more details. |
---|
22 | |
---|
23 | You should have received a copy of the GNU General Public License |
---|
24 | along with this program; if not, write to the Free Software |
---|
25 | Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA |
---|
26 | 02111-1307, USA. |
---|
27 | */ |
---|
28 | |
---|
29 | #include "Score.h" |
---|
30 | |
---|
31 | #include <cmath> |
---|
32 | |
---|
33 | namespace theplu { |
---|
34 | namespace yat { |
---|
35 | namespace utility { |
---|
36 | class vector; |
---|
37 | } |
---|
38 | namespace statistics { |
---|
/**
   @brief Fisher's exact test.

   Fisher's exact test is a procedure that you can use for data
   in a two by two contingency table: \f[ \begin{tabular}{|c|c|}
   \hline a&b \tabularnewline \hline c&d \tabularnewline \hline
   \end{tabular} \f] Fisher's exact test is based on exact
   probabilities from a specific distribution (the hypergeometric
   distribution). There is really no lower bound on the amount of
   data that is needed for Fisher's exact test. You do have to
   have at least one data value in each row and one data value in
   each column. If an entire row or column is zero, then you
   do not really have a 2 by 2 table. But you can use Fisher's
   exact test when one of the cells in your table has a zero in
   it. Fisher's exact test is also very useful for highly
   imbalanced tables. If one or two of the cells in a two by two
   table have numbers in the thousands and one or two of the
   other cells have numbers less than 5, you can still use
   Fisher's exact test. For very large tables (where all four
   entries in the two by two table are large), your computer may
   take too much time to compute Fisher's exact test. In these
   situations, though, you might as well use the Chi-square test
   because a large sample approximation (that the Chi-square test
   relies on) is very reasonable. If all elements are larger than
   10 a Chi-square test is reasonable to use.

   @note The statistic assumes that each column sum and each row
   sum, respectively, is fixed. Just because you have a 2x2 table,
   this assumption does not necessarily match your experimental
   setup. See e.g. Barnard's test for an alternative.
*/
class Fisher
{

public:
  ///
  /// @brief Default constructor.
  ///
  Fisher();

  ///
  /// @brief Destructor.
  ///
  /// Virtual, so the class can be deleted through a base pointer
  /// when it is derived from.
  ///
  virtual ~Fisher();


  /**
     The Chi2 score is calculated as \f$ \sum
     \frac{(O_i-E_i)^2}{E_i}\f$ where \a E is the expected value
     and \a O is the observed value.

     \return Chi2 score
  */
  double Chi2() const;

  /**
     Calculates the expected values under the null hypothesis and
     returns them through the four reference arguments.
     \f$ a' = \frac{(a+c)(a+b)}{a+b+c+d} \f$,
     \f$ b' = \frac{(a+b)(b+d)}{a+b+c+d} \f$,
     \f$ c' = \frac{(a+c)(c+d)}{a+b+c+d} \f$,
     \f$ d' = \frac{(b+d)(c+d)}{a+b+c+d} \f$,

     \param a set to expected value \f$ a' \f$
     \param b set to expected value \f$ b' \f$
     \param c set to expected value \f$ c' \f$
     \param d set to expected value \f$ d' \f$
  */
  void expected(double& a, double& b, double& c, double& d) const;

  ///
  /// If all elements in the table are at least minimum_size(), a
  /// Chi2 approximation is used for p-value calculation.
  ///
  /// The returned reference is mutable, i.e., this accessor is
  /// also the way to change the threshold.
  ///
  /// @return reference to minimum_size
  ///
  unsigned int& minimum_size();

  ///
  /// If all elements in the table are at least minimum_size(), a
  /// Chi2 approximation is used for p-value calculation.
  ///
  /// @return const reference to minimum_size
  ///
  const unsigned int& minimum_size() const;

  /**
     If all elements in the table are at least minimum_size(), a
     Chi2 approximation is used.

     Otherwise a two-sided p-value is calculated using the
     hypergeometric distribution
     \f$ \sum_k P(k) \f$ where summation runs over \a k such that
     \f$ |k-<a>| \ge |a-<a>| \f$.

     \return two-sided p-value
  */
  double p_value() const;

  ///
  /// The one-sided p-value is the probability to get a larger (or
  /// equal) oddsratio.
  ///
  /// If all elements in the table are at least minimum_size(), a
  /// Chi2 approximation is used.
  ///
  /// @return One-sided p-value
  ///
  double p_value_one_sided() const;

  /**
     Function calculating the odds ratio from a 2x2 table
     \f[ \begin{tabular}{|c|c|}
     \hline a&b \tabularnewline \hline c&d \tabularnewline \hline
     \end{tabular} \f] as \f$ \frac{ad}{bc} \f$

     The object will remember the values of \a a, \a b, \a c, and
     \a d.

     @param a upper left cell of the table
     @param b upper right cell of the table
     @param c lower left cell of the table
     @param d lower right cell of the table

     @return odds ratio.

     @throw If the table is invalid a runtime_error is thrown. A
     table is invalid if a row or column sum is zero.
  */
  double oddsratio(const unsigned int a, const unsigned int b,
                   const unsigned int c, const unsigned int d);

private:
  // NOTE(review): presumably tells whether the exact calculation
  // (rather than the Chi2 approximation) should be used, based on
  // minimum_size_ -- confirm against the implementation file.
  bool calculate_p_exact() const;

  // two-sided
  double p_value_approximative() const;
  // two-sided
  double p_value_exact() const;

  // Cells of the 2x2 table remembered by oddsratio().
  unsigned int a_;
  unsigned int b_;
  unsigned int c_;
  unsigned int d_;
  // Threshold exposed through minimum_size().
  unsigned int minimum_size_;
  // NOTE(review): presumably the value computed by the latest call
  // to oddsratio() -- confirm against the implementation file.
  double oddsratio_;
};
174 | |
---|
175 | }}} // of namespace statistics, yat, and theplu |
---|
176 | |
---|
177 | #endif |
---|