1 | #ifndef _theplu_yat_statistics_fisher_ |
2 | #define _theplu_yat_statistics_fisher_ |
3 | |
4 | // $Id: Fisher.h 747 2007-02-11 13:26:41Z peter $ |
5 | |
6 | /* |
7 | Copyright (C) The authors contributing to this file. |
8 | |
9 | This file is part of the yat library, http://lev.thep.lu.se/trac/yat |
10 | |
11 | The yat library is free software; you can redistribute it and/or |
12 | modify it under the terms of the GNU General Public License as |
13 | published by the Free Software Foundation; either version 2 of the |
14 | License, or (at your option) any later version. |
15 | |
16 | The yat library is distributed in the hope that it will be useful, |
17 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
18 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
19 | General Public License for more details. |
20 | |
21 | You should have received a copy of the GNU General Public License |
22 | along with this program; if not, write to the Free Software |
23 | Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA |
24 | 02111-1307, USA. |
25 | */ |
26 | |
27 | #include "Score.h" |
28 | |
29 | #include <sys/types.h> |
30 | |
31 | #include <cmath> |
32 | |
33 | namespace theplu { |
34 | namespace yat { |
// Forward declaration only: this header refers to utility::vector
// solely through const references in the score() overloads, so the
// full class definition is not needed here.
namespace utility {
class vector;
}
38 | namespace statistics { |
39 | /** |
40 | @brief Fisher's exact test. |
41 | |
42 | Fisher's Exact test is a procedure that you can use for data |
43 | in a two by two contingency table: \f[ \begin{tabular}{|c|c|} |
44 | \hline a&b \tabularnewline \hline c&d \tabularnewline \hline |
45 | \end{tabular} \f] Fisher's Exact Test is based on exact |
46 | probabilities from a specific distribution (the hypergeometric |
47 | distribution). There's really no lower bound on the amount of |
48 | data that is needed for Fisher's Exact Test. You do have to |
49 | have at least one data value in each row and one data value in |
50 | each column. If an entire row or column is zero, then you |
51 | don't really have a 2 by 2 table. But you can use Fisher's |
52 | Exact Test when one of the cells in your table has a zero in |
53 | it. Fisher's Exact Test is also very useful for highly |
54 | imbalanced tables. If one or two of the cells in a two by two |
55 | table have numbers in the thousands and one or two of the |
56 | other cells has numbers less than 5, you can still use |
57 | Fisher's Exact Test. For very large tables (where all four |
58 | entries in the two by two table are large), your computer may |
59 | take too much time to compute Fisher's Exact Test. In these |
60 | situations, though, you might as well use the Chi-square test |
61 | because a large sample approximation (that the Chi-square test |
62 | relies on) is very reasonable. If all elements are larger than |
63 | 10 a Chi-square test is reasonable to use. |
64 | |
@note The statistic assumes that the row sums and the column sums
are fixed. Just because you have a 2x2 table, this assumption
does not necessarily match your experimental setup. See
e.g. Barnard's test for an alternative.
69 | */ |
70 | |
71 | class Fisher : public Score |
72 | { |
73 | |
74 | public: |
75 | /// |
76 | /// Default Constructor. |
77 | /// |
78 | Fisher(bool absolute=true); |
79 | |
80 | /// |
81 | /// Destructor |
82 | /// |
83 | virtual ~Fisher(void) {}; |
84 | |
85 | |
86 | /// |
87 | /// @return Chi2 score |
88 | /// |
89 | double Chi2(void) const; |
90 | |
91 | /// |
92 | /// Calculates the expected values under the null hypothesis. |
93 | /// a' = (a+c)(a+b)/(a+b+c+d) |
94 | /// |
95 | void expected(double& a, double& b, double& c, double& d) const; |
96 | |
97 | /// |
98 | /// minimum_size is the threshold for when the p-value calculation |
99 | /// is performed using a Chi2 approximation. |
100 | /// |
101 | /// @return reference to minimum_size |
102 | /// |
103 | u_int& minimum_size(void); |
104 | |
105 | /// |
106 | /// If absolute, the p-value is the two-sided p-value. If all |
107 | /// elements in table is at least minimum_size, a Chi2 |
108 | /// approximation is used. |
109 | /// |
110 | /// @return p-value |
111 | /// |
112 | /// @note in weighted case, approximation Chi2 is always used. |
113 | /// |
114 | double p_value() const; |
115 | |
116 | /// |
117 | /// Function calculating score from 2x2 table for which the |
118 | /// elements are calculated as follows \n |
119 | /// target.binary(i) sample i in group a or c otherwise in b or d |
120 | /// \f$ value(i) > \f$ value_cutoff() sample i in group a or b |
121 | /// otherwise c or d\n |
122 | /// |
123 | /// @return odds ratio. If absolute_ is true and odds ratio is |
124 | /// less than unity 1 divided by odds ratio is returned |
125 | /// |
126 | double score(const classifier::Target& target, |
127 | const utility::vector& value); |
128 | |
129 | /// |
130 | /// Weighted version of score. Each element in 2x2 table is |
131 | /// calculated as \f$ \sum w_i \f$, so when each weight is |
132 | /// unitary the same table is created as in the unweighted version |
133 | /// |
134 | /// @return odds ratio |
135 | /// |
136 | /// @see score |
137 | /// |
138 | double score(const classifier::Target& target, |
139 | const classifier::DataLookupWeighted1D& value); |
140 | |
141 | |
142 | /// |
143 | /// Weighted version of score. Each element in 2x2 table is |
144 | /// calculated as \f$ \sum w_i \f$, so when each weight is |
145 | /// unitary the same table is created as in the unweighted version |
146 | /// |
147 | /// @return odds ratio |
148 | /// |
149 | /// @see score |
150 | /// |
151 | double score(const classifier::Target& target, |
152 | const utility::vector& value, |
153 | const utility::vector& weight); |
154 | |
155 | /// |
156 | /// \f$ \frac{ad}{bc} \f$ |
157 | /// |
158 | /// @return odds ratio. If absolute_ is true and odds ratio is |
159 | /// less than unity, 1 divided by odds ratio is returned |
160 | /// |
161 | double score(const u_int a, const u_int b, |
162 | const u_int c, const u_int d); |
163 | |
164 | /// |
165 | /// Cutoff sets the limit whether a value should go into the left |
166 | /// or the right row. @see score |
167 | /// |
168 | /// @return reference to cutoff for row |
169 | /// |
170 | double& value_cutoff(void); |
171 | |
172 | private: |
173 | double oddsratio(const double a, const double b, |
174 | const double c, const double d); |
175 | |
176 | // two-sided |
177 | double p_value_approximative(void) const; |
178 | //two-sided |
179 | double p_value_exact(void) const; |
180 | |
181 | double a_; |
182 | double b_; |
183 | double c_; |
184 | double d_; |
185 | u_int minimum_size_; |
186 | double oddsratio_; |
187 | double value_cutoff_; |
188 | }; |
189 | |
190 | }}} // of namespace statistics, yat, and theplu |
191 | |
192 | #endif |
