source: trunk/c++_tools/statistics/Fisher.h @ 675

Last change on this file since 675 was 675, checked in by Jari Häkkinen, 15 years ago

References #83. Changing project name to yat. Compilation will fail in this revision.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 5.7 KB
Line 
1#ifndef _theplu_statistics_fisher_
2#define _theplu_statistics_fisher_
3
4// $Id: Fisher.h 675 2006-10-10 12:08:45Z jari $
5
6/*
7  Copyright (C) The authors contributing to this file.
8
9  This file is part of the yat library, http://lev.thep.lu.se/trac/yat
10
11  The yat library is free software; you can redistribute it and/or
12  modify it under the terms of the GNU General Public License as
13  published by the Free Software Foundation; either version 2 of the
14  License, or (at your option) any later version.
15
16  The yat library is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19  General Public License for more details.
20
21  You should have received a copy of the GNU General Public License
22  along with this program; if not, write to the Free Software
23  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
24  02111-1307, USA.
25*/
26
27#include "yat/statistics/Score.h"
28#include "yat/utility/vector.h"
29
30#include <cmath>
31
32namespace theplu {
33namespace statistics { 
34  /**
35     @brief Fisher's exact test.   
36
37     Fisher's Exact test is a procedure that you can use for data
38     in a two by two contingency table: \f[ \begin{tabular}{|c|c|}
39     \hline a&b \tabularnewline \hline c&d \tabularnewline \hline
40     \end{tabular} \f] Fisher's Exact Test is based on exact
41     probabilities from a specific distribution (the hypergeometric
42     distribution). There's really no lower bound on the amount of
43     data that is needed for Fisher's Exact Test. You do have to
44     have at least one data value in each row and one data value in
45     each column. If an entire row or column is zero, then you
46     don't really have a 2 by 2 table. But you can use Fisher's
47     Exact Test when one of the cells in your table has a zero in
48     it. Fisher's Exact Test is also very useful for highly
49     imbalanced tables. If one or two of the cells in a two by two
50     table have numbers in the thousands and one or two of the
51     other cells has numbers less than 5, you can still use
52     Fisher's Exact Test. For very large tables (where all four
53     entries in the two by two table are large), your computer may
54     take too much time to compute Fisher's Exact Test. In these
55     situations, though, you might as well use the Chi-square test
56     because a large sample approximation (that the Chi-square test
57     relies on) is very reasonable. If all elements are larger than
58     10 a Chi-square test is reasonable to use.
59     
60     @note The statistic assumes that the column sums and the row
61     sums are all fixed. Just because you have a 2x2 table, this
62     assumption does not necessarily match your experimental setup.
63     See e.g. Barnard's test for an alternative.
64  */
65 
66  class Fisher : public Score
67  {
68 
69  public:
70    ///
71    /// Default Constructor.
72    ///
73    Fisher(bool absolute=true);
74
75    ///
76    /// Destructor
77    ///
78    virtual ~Fisher(void) {};
79         
80   
81    ///
82    /// @return Chi2 score
83    ///
84    double Chi2(void) const;
85
86    ///
87    /// Cutoff sets the limit whether a value should go into the left
88    /// or the right row. @see score
89    ///
90    /// @return reference to cutoff for row
91    ///
92    inline double& value_cutoff(void) { return value_cutoff_; }
93
94    ///
95    /// Calculates the expected values under the null hypothesis.
96    /// a' = (a+c)(a+b)/(a+b+c+d)
97    ///
98    void expected(double& a, double& b, double& c, double& d) const;
99
100    ///
101    /// minimum_size is the threshold for when the p-value calculation
102    /// is performed using a Chi2 approximation.
103    ///
104    /// @return reference to minimum_size
105    ///
106    inline u_int& minimum_size(void){ return minimum_size_; } 
107
108    ///
109    /// If absolute, the p-value is the two-sided p-value. If all
110    /// elements in table is at least minimum_size, a Chi2
111    /// approximation is used.
112    ///
113    /// @return p-value
114    ///
115    /// @note in weighted case, approximation Chi2 is always used.
116    ///
117    double p_value() const;
118   
119    ///
120    /// Function calculating score from 2x2 table for which the
121    /// elements are calculated as follows \n
122    /// target.binary(i) sample i in group a or c otherwise in b or d
123    /// \f$ value(i) > \f$ value_cutoff() sample i in group a or b
124    /// otherwise c or d\n
125    ///
126    /// @return odds ratio. If absolute_ is true and odds ratio is
127    /// less than unity 1 divided by odds ratio is returned
128    ///
129    double score(const classifier::Target& target, 
130                 const utility::vector& value);
131
132    ///
133    /// Weighted version of score. Each element in 2x2 table is
134    /// calculated as \f$ \sum w_i \f$, so when each weight is
135    /// unitary the same table is created as in the unweighted version
136    ///
137    /// @return odds ratio
138    ///
139    /// @see score
140    ///
141    double score(const classifier::Target& target, 
142                 const classifier::DataLookupWeighted1D& value);
143
144
145    ///
146    /// Weighted version of score. Each element in 2x2 table is
147    /// calculated as \f$ \sum w_i \f$, so when each weight is
148    /// unitary the same table is created as in the unweighted version
149    ///
150    /// @return odds ratio
151    ///
152    /// @see score
153    ///
154    double score(const classifier::Target& target, 
155                 const utility::vector& value,
156                 const utility::vector& weight); 
157
158    ///
159    /// \f$ \frac{ad}{bc} \f$
160    ///
161    /// @return odds ratio. If absolute_ is true and odds ratio is
162    /// less than unity, 1 divided by odds ratio is returned
163    ///
164    double score(const u_int a, const u_int b, 
165                 const u_int c, const u_int d); 
166   
167
168         
169  private:
170    double oddsratio(const double a, const double b, 
171                     const double c, const double d);
172
173    // two-sided
174    double p_value_approximative(void) const;
175    //two-sided
176    double p_value_exact(void) const;
177
178    double a_;
179    double b_;
180    double c_;
181    double d_;
182    u_int minimum_size_;
183    double oddsratio_;
184    double value_cutoff_;
185  };
186
187}} // of namespace statistics and namespace theplu
188
189#endif
190
Note: See TracBrowser for help on using the repository browser.