1 | // $Id: Fisher.h 556 2006-03-08 13:09:48Z peter $ |
---|
2 | |
---|
3 | #ifndef _theplu_statistics_fisher_ |
---|
4 | #define _theplu_statistics_fisher_ |
---|
5 | |
---|
6 | #include <c++_tools/statistics/Score.h> |
---|
7 | #include <c++_tools/gslapi/vector.h> |
---|
8 | |
---|
9 | #include <cmath> |
---|
10 | |
---|
11 | namespace theplu { |
---|
12 | namespace statistics { |
---|
13 | /// |
---|
14 | /// @brief Fisher's exact test. |
---|
15 | /// Fisher's Exact test is a procedure that you can use for data |
---|
16 | /// in a two by two contingency table: \f[ \begin{tabular}{|c|c|} |
---|
17 | /// \hline a&b \tabularnewline \hline c&d \tabularnewline \hline |
---|
18 | /// \end{tabular} \f] Fisher's Exact Test is based on exact |
---|
19 | /// probabilities from a specific distribution (the hypergeometric |
---|
20 | /// distribution). There's really no lower bound on the amount of |
---|
21 | /// data that is needed for Fisher's Exact Test. You do have to |
---|
22 | /// have at least one data value in each row and one data value in |
---|
23 | /// each column. If an entire row or column is zero, then you |
---|
24 | /// don't really have a 2 by 2 table. But you can use Fisher's |
---|
25 | /// Exact Test when one of the cells in your table has a zero in |
---|
26 | /// it. Fisher's Exact Test is also very useful for highly |
---|
27 | /// imbalanced tables. If one or two of the cells in a two by two |
---|
28 | /// table have numbers in the thousands and one or two of the |
---|
29 | /// other cells have numbers less than 5, you can still use |
---|
30 | /// Fisher's Exact Test. For very large tables (where all four |
---|
31 | /// entries in the two by two table are large), your computer may |
---|
32 | /// take too much time to compute Fisher's Exact Test. In these |
---|
33 | /// situations, though, you might as well use the Chi-square test |
---|
34 | /// because a large sample approximation (that the Chi-square test |
---|
35 | /// relies on) is very reasonable. If all elements are larger than |
---|
36 | /// 10 a Chi-square test is reasonable to use. |
---|
37 | /// |
---|
38 | /// @note The statistic assumes that each column and row sum, |
---|
39 | /// respectively, is fixed. Just because you have a 2x2 table, this |
---|
40 | /// assumption does not necessarily match your experimental setup. See |
---|
41 | /// e.g. Barnard's test for an alternative. |
---|
42 | /// |
---|
43 | |
---|
44 | class Fisher : public Score |
---|
45 | { |
---|
46 | |
---|
47 | public: |
---|
48 | /// |
---|
49 | /// Default Constructor. |
---|
50 | /// |
---|
51 | Fisher(bool absolute=true); |
---|
52 | |
---|
53 | /// |
---|
54 | /// Destructor |
---|
55 | /// |
---|
56 | virtual ~Fisher(void) {}; |
---|
57 | |
---|
58 | |
---|
59 | /// |
---|
60 | /// @return Chi2 score |
---|
61 | /// |
---|
62 | double Chi2(void) const; |
---|
63 | |
---|
64 | /// |
---|
65 | /// Cutoff sets the limit whether a value should go into the left |
---|
66 | /// or the right row. @see score |
---|
67 | /// |
---|
68 | /// @return reference to cutoff for row |
---|
69 | /// |
---|
70 | inline double& value_cutoff(void) { return value_cutoff_; } |
---|
71 | |
---|
72 | /// |
---|
73 | /// Calculates the expected values under the null hypothesis. |
---|
74 | /// a' = (a+c)(a+b)/(a+b+c+d) |
---|
75 | /// |
---|
76 | void expected(double& a, double& b, double& c, double& d) const; |
---|
77 | |
---|
78 | /// |
---|
79 | /// minimum_size is the threshold for when the p-value calculation |
---|
80 | /// is performed using a Chi2 approximation. |
---|
81 | /// |
---|
82 | /// @return reference to minimum_size |
---|
83 | /// |
---|
84 | inline u_int& minimum_size(void){ return minimum_size_; } |
---|
85 | |
---|
86 | /// |
---|
87 | /// If absolute, the p-value is the two-sided p-value. If all |
---|
88 | /// elements in table is at least minimum_size, a Chi2 |
---|
89 | /// approximation is used. |
---|
90 | /// |
---|
91 | /// @return p-value |
---|
92 | /// |
---|
93 | /// @note in weighted case, approximation Chi2 is always used. |
---|
94 | /// |
---|
95 | double p_value() const; |
---|
96 | |
---|
97 | /// |
---|
98 | /// Function calculating score from 2x2 table for which the |
---|
99 | /// elements are calculated as follows \n |
---|
100 | /// target.binary(i) sample i in group a or c otherwise in b or d |
---|
101 | /// \f$ value(i) > \f$ value_cutoff() sample i in group a or b |
---|
102 | /// otherwise c or d\n |
---|
103 | /// |
---|
104 | /// @return odds ratio. If absolute_ is true and odds ratio is |
---|
105 | /// less than unity 1 divided by odds ratio is returned |
---|
106 | /// |
---|
107 | double score(const classifier::Target& target, |
---|
108 | const gslapi::vector& value); |
---|
109 | |
---|
110 | /// |
---|
111 | /// Weighted version of score. Each element in 2x2 table is |
---|
112 | /// calculated as \f$ \sum w_i \f$, so when each weight is |
---|
113 | /// unitary the same table is created as in the unweighted version |
---|
114 | /// |
---|
115 | /// @return odds ratio |
---|
116 | /// |
---|
117 | /// @see score |
---|
118 | /// |
---|
119 | double score(const classifier::Target& target, |
---|
120 | const gslapi::vector& value, |
---|
121 | const gslapi::vector& weight); |
---|
122 | |
---|
123 | /// |
---|
124 | /// \f$ \frac{ad}{bc} \f$ |
---|
125 | /// |
---|
126 | /// @return odds ratio. If absolute_ is true and odds ratio is |
---|
127 | /// less than unity, 1 divided by odds ratio is returned |
---|
128 | /// |
---|
129 | double score(const u_int a, const u_int b, |
---|
130 | const u_int c, const u_int d); |
---|
131 | |
---|
132 | |
---|
133 | |
---|
134 | private: |
---|
135 | double oddsratio(const double a, const double b, |
---|
136 | const double c, const double d); |
---|
137 | |
---|
138 | // two-sided |
---|
139 | double p_value_approximative(void) const; |
---|
140 | //two-sided |
---|
141 | double p_value_exact(void) const; |
---|
142 | |
---|
143 | double a_; |
---|
144 | double b_; |
---|
145 | double c_; |
---|
146 | double d_; |
---|
147 | u_int minimum_size_; |
---|
148 | double oddsratio_; |
---|
149 | double value_cutoff_; |
---|
150 | }; |
---|
151 | |
---|
152 | }} // of namespace statistics and namespace theplu |
---|
153 | |
---|
154 | #endif |
---|
155 | |
---|