source: trunk/yat/statistics/Fisher.h @ 683

Last change on this file since 683 was 683, checked in by Jari Häkkinen, 16 years ago

Addresses #153. Clean up of code.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 5.7 KB
Line 
1#ifndef _theplu_yat_statistics_fisher_
2#define _theplu_yat_statistics_fisher_
3
4// $Id: Fisher.h 683 2006-10-11 22:20:36Z jari $
5
6/*
7  Copyright (C) The authors contributing to this file.
8
9  This file is part of the yat library, http://lev.thep.lu.se/trac/yat
10
11  The yat library is free software; you can redistribute it and/or
12  modify it under the terms of the GNU General Public License as
13  published by the Free Software Foundation; either version 2 of the
14  License, or (at your option) any later version.
15
16  The yat library is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19  General Public License for more details.
20
21  You should have received a copy of the GNU General Public License
22  along with this program; if not, write to the Free Software
23  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
24  02111-1307, USA.
25*/
26
27#include "Score.h"
28#include "yat/utility/vector.h"
29
30#include <cmath>
31
32namespace theplu {
33namespace yat {
34namespace statistics { 
35  /**
36     @brief Fisher's exact test.   
37
38     Fisher's Exact test is a procedure that you can use for data
39     in a two by two contingency table: \f[ \begin{tabular}{|c|c|}
40     \hline a&b \tabularnewline \hline c&d \tabularnewline \hline
41     \end{tabular} \f] Fisher's Exact Test is based on exact
42     probabilities from a specific distribution (the hypergeometric
43     distribution). There's really no lower bound on the amount of
44     data that is needed for Fisher's Exact Test. You do have to
45     have at least one data value in each row and one data value in
46     each column. If an entire row or column is zero, then you
47     don't really have a 2 by 2 table. But you can use Fisher's
48     Exact Test when one of the cells in your table has a zero in
49     it. Fisher's Exact Test is also very useful for highly
50     imbalanced tables. If one or two of the cells in a two by two
51     table have numbers in the thousands and one or two of the
52     other cells has numbers less than 5, you can still use
53     Fisher's Exact Test. For very large tables (where all four
54     entries in the two by two table are large), your computer may
55     take too much time to compute Fisher's Exact Test. In these
56     situations, though, you might as well use the Chi-square test
57     because a large sample approximation (that the Chi-square test
58     relies on) is very reasonable. If all elements are larger than
59     10 a Chi-square test is reasonable to use.
60     
61     @note The statistic assumes that each column and row sum,
62     respectively, are fixed. Just because you have a 2x2 table, this
63     assumption does not necessarily match your experimental setup. See
64     e.g. Barnard's test for an alternative.
65  */
66 
67  class Fisher : public Score
68  {
69 
70  public:
71    ///
72    /// Default Constructor.
73    ///
74    Fisher(bool absolute=true);
75
76    ///
77    /// Destructor
78    ///
79    virtual ~Fisher(void) {};
80         
81   
82    ///
83    /// @return Chi2 score
84    ///
85    double Chi2(void) const;
86
87    ///
88    /// Cutoff sets the limit whether a value should go into the left
89    /// or the right row. @see score
90    ///
91    /// @return reference to cutoff for row
92    ///
93    inline double& value_cutoff(void) { return value_cutoff_; }
94
95    ///
96    /// Calculates the expected values under the null hypothesis.
97    /// a' = (a+c)(a+b)/(a+b+c+d)
98    ///
99    void expected(double& a, double& b, double& c, double& d) const;
100
101    ///
102    /// minimum_size is the threshold for when the p-value calculation
103    /// is performed using a Chi2 approximation.
104    ///
105    /// @return reference to minimum_size
106    ///
107    inline u_int& minimum_size(void){ return minimum_size_; } 
108
109    ///
110    /// If absolute, the p-value is the two-sided p-value. If all
111    /// elements in table is at least minimum_size, a Chi2
112    /// approximation is used.
113    ///
114    /// @return p-value
115    ///
116    /// @note in weighted case, approximation Chi2 is always used.
117    ///
118    double p_value() const;
119   
120    ///
121    /// Function calculating score from 2x2 table for which the
122    /// elements are calculated as follows \n
123    /// target.binary(i) sample i in group a or c otherwise in b or d
124    /// \f$ value(i) > \f$ value_cutoff() sample i in group a or b
125    /// otherwise c or d\n
126    ///
127    /// @return odds ratio. If absolute_ is true and odds ratio is
128    /// less than unity 1 divided by odds ratio is returned
129    ///
130    double score(const classifier::Target& target, 
131                 const utility::vector& value);
132
133    ///
134    /// Weighted version of score. Each element in 2x2 table is
135    /// calculated as \f$ \sum w_i \f$, so when each weight is
136    /// unitary the same table is created as in the unweighted version
137    ///
138    /// @return odds ratio
139    ///
140    /// @see score
141    ///
142    double score(const classifier::Target& target, 
143                 const classifier::DataLookupWeighted1D& value);
144
145
146    ///
147    /// Weighted version of score. Each element in 2x2 table is
148    /// calculated as \f$ \sum w_i \f$, so when each weight is
149    /// unitary the same table is created as in the unweighted version
150    ///
151    /// @return odds ratio
152    ///
153    /// @see score
154    ///
155    double score(const classifier::Target& target, 
156                 const utility::vector& value,
157                 const utility::vector& weight); 
158
159    ///
160    /// \f$ \frac{ad}{bc} \f$
161    ///
162    /// @return odds ratio. If absolute_ is true and odds ratio is
163    /// less than unity, 1 divided by odds ratio is returned
164    ///
165    double score(const u_int a, const u_int b, 
166                 const u_int c, const u_int d); 
167
168         
169  private:
170    double oddsratio(const double a, const double b, 
171                     const double c, const double d);
172
173    // two-sided
174    double p_value_approximative(void) const;
175    //two-sided
176    double p_value_exact(void) const;
177
178    double a_;
179    double b_;
180    double c_;
181    double d_;
182    u_int minimum_size_;
183    double oddsratio_;
184    double value_cutoff_;
185  };
186
187}}} // of namespace statistics, yat, and theplu
188
189#endif
Note: See TracBrowser for help on using the repository browser.