source: trunk/yat/statistics/Fisher.h @ 760

Last change on this file since 760 was 760, checked in by Peter, 15 years ago

throw exception when detecting invalid 2x2 table. Fixes #26

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 5.9 KB
Line 
1#ifndef _theplu_yat_statistics_fisher_
2#define _theplu_yat_statistics_fisher_
3
4// $Id: Fisher.h 760 2007-02-20 09:58:33Z peter $
5
6/*
7  Copyright (C) The authors contributing to this file.
8
9  This file is part of the yat library, http://lev.thep.lu.se/trac/yat
10
11  The yat library is free software; you can redistribute it and/or
12  modify it under the terms of the GNU General Public License as
13  published by the Free Software Foundation; either version 2 of the
14  License, or (at your option) any later version.
15
16  The yat library is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19  General Public License for more details.
20
21  You should have received a copy of the GNU General Public License
22  along with this program; if not, write to the Free Software
23  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
24  02111-1307, USA.
25*/
26
27#include "Score.h"
28
29#include <sys/types.h>
30
31#include <cmath>
32
33namespace theplu {
34namespace yat {
35namespace utility {
36  class vector;
37}
38namespace statistics { 
39  /**
40     @brief Fisher's exact test.   
41
42     Fisher's Exact test is a procedure that you can use for data
43     in a two by two contingency table: \f[ \begin{tabular}{|c|c|}
44     \hline a&b \tabularnewline \hline c&d \tabularnewline \hline
45     \end{tabular} \f] Fisher's Exact Test is based on exact
46     probabilities from a specific distribution (the hypergeometric
47     distribution). There's really no lower bound on the amount of
48     data that is needed for Fisher's Exact Test. You do have to
49     have at least one data value in each row and one data value in
50     each column. If an entire row or column is zero, then you
51     don't really have a 2 by 2 table. But you can use Fisher's
52     Exact Test when one of the cells in your table has a zero in
53     it. Fisher's Exact Test is also very useful for highly
54     imbalanced tables. If one or two of the cells in a two by two
55     table have numbers in the thousands and one or two of the
56     other cells has numbers less than 5, you can still use
57     Fisher's Exact Test. For very large tables (where all four
58     entries in the two by two table are large), your computer may
59     take too much time to compute Fisher's Exact Test. In these
60     situations, though, you might as well use the Chi-square test
61     because a large sample approximation (that the Chi-square test
62     relies on) is very reasonable. If all elements are larger than
63     10 a Chi-square test is reasonable to use.
64     
65     @note The statistic assumes that each column and row sum,
66     respectively, is fixed. Just because you have a 2x2 table, this
67     assumption does not necessarily match your experimental setup. See
68     e.g. Barnard's test for an alternative.
69  */
70 
71  class Fisher : public Score
72  {
73  // Fisher's exact test for a 2x2 table (a b / c d): score() returns the odds ratio, p_value() the p-value.
74  public:
75    ///
76    /// Default Constructor.
77    /// @param absolute see p_value() and score() for what 'absolute' changes; presumably stored by the Score base class (TODO confirm).
78    Fisher(bool absolute=true);
79
80    ///
81    /// Destructor (virtual, empty body).
82    ///
83    virtual ~Fisher(void) {};
84         
85   
86    ///
87    /// @return Chi2 score (presumably of the current 2x2 table - TODO confirm)
88    ///
89    double Chi2(void) const;
90
91    ///
92    /// Calculates the expected values under the null hypothesis,
93    /// e.g. a' = (a+c)(a+b)/(a+b+c+d).
94    /// @param a,b,c,d output references, presumably receiving the expected cell values (TODO confirm).
95    void expected(double& a, double& b, double& c, double& d) const;
96
97    ///
98    /// minimum_size is the threshold for when the p-value calculation
99    /// is performed using a Chi2 approximation.
100    ///
101    /// @return non-const reference to minimum_size, so the threshold can also be assigned through this accessor
102    ///
103    u_int& minimum_size(void);
104
105    ///
106    /// If absolute, the p-value is the two-sided p-value. If all
107    /// elements in the table are at least minimum_size, a Chi2
108    /// approximation is used.
109    ///
110    /// @return p-value
111    ///
112    /// @note in the weighted case, the Chi2 approximation is always used.
113    ///
114    double p_value() const;
115   
116    ///
117    /// Calculates the score from a 2x2 table whose elements are
118    /// determined as follows: \n
119    /// if target.binary(i), sample i is in group a or c, otherwise in b or d; \n
120    /// if \f$ value(i) > \f$ value_cutoff(), sample i is in group a or b,
121    /// otherwise in c or d. \n
122    ///
123    /// @return odds ratio. If absolute_ is true and the odds ratio is
124    /// less than unity, 1 divided by the odds ratio is returned.
125    ///
126    /// @throw runtime_error if the table is invalid.
127    ///
128    double score(const classifier::Target& target, 
129                 const utility::vector& value);
130
131    ///
132    /// Weighted version of score. Each element in the 2x2 table is
133    /// calculated as \f$ \sum w_i \f$, so when each weight is
134    /// unity the same table is created as in the unweighted version.
135    /// The weights presumably come from the weighted lookup @a value (TODO confirm).
136    /// @return odds ratio
137    ///
138    /// @see score
139    ///
140    /// @throw runtime_error if the table is invalid.
141    ///
142    double score(const classifier::Target& target, 
143                 const classifier::DataLookupWeighted1D& value);
144
145
146    ///
147    /// Weighted version of score. Each element in the 2x2 table is
148    /// calculated as \f$ \sum w_i \f$, so when each weight is
149    /// unity the same table is created as in the unweighted version.
150    /// @param weight presumably the per-sample weights \f$ w_i \f$ (TODO confirm).
151    /// @return odds ratio
152    ///
153    /// @see score
154    ///
155    /// @throw runtime_error if the table is invalid.
156    ///
157    double score(const classifier::Target& target, 
158                 const utility::vector& value,
159                 const utility::vector& weight); 
160
161    /// Score calculated directly from the four elements of the 2x2 table:
162    /// \f$ \frac{ad}{bc} \f$
163    ///
164    /// @return odds ratio. If absolute_ is true and the odds ratio is
165    /// less than unity, 1 divided by the odds ratio is returned.
166    ///
167    /// @throw runtime_error if the table is invalid.
168    ///
169    double score(const u_int a, const u_int b, 
170                 const u_int c, const u_int d); 
171
172    ///
173    /// Cutoff deciding which row of the 2x2 table a value goes into:
174    /// per score(), value > cutoff gives a or b, otherwise c or d. @see score
175    ///
176    /// @return non-const reference to the cutoff, so it can also be assigned through this accessor
177    ///
178    double& value_cutoff(void);
179
180  private:
181    double oddsratio(const double a, const double b, 
182                     const double c, const double d); // presumably ad/bc as documented for score(a,b,c,d) - TODO confirm
183
184    // two-sided p-value; presumably the Chi2 approximation mentioned under p_value() - TODO confirm
185    double p_value_approximative(void) const;
186    // two-sided p-value; presumably the exact calculation used when the approximation is not - TODO confirm
187    double p_value_exact(void) const;
188
189    double a_; // a_, b_, c_, d_: presumably the elements of the 2x2 table (a b / c d) - TODO confirm
190    double b_;
191    double c_;
192    double d_;
193    u_int minimum_size_; // presumably the value exposed by minimum_size() - TODO confirm
194    double oddsratio_; // presumably the odds ratio from the latest score() call - TODO confirm
195    double value_cutoff_; // presumably the value exposed by value_cutoff() - TODO confirm
196  };
197
198}}} // of namespace statistics, yat, and theplu
199
200#endif
Note: See TracBrowser for help on using the repository browser.