source: trunk/yat/statistics/Fisher.h @ 718

Last change on this file since 718 was 718, checked in by Jari Häkkinen, 15 years ago

Addresses #170.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 5.6 KB
Line 
1#ifndef _theplu_yat_statistics_fisher_
2#define _theplu_yat_statistics_fisher_
3
4// $Id: Fisher.h 718 2006-12-26 09:56:26Z jari $
5
6/*
7  Copyright (C) The authors contributing to this file.
8
9  This file is part of the yat library, http://lev.thep.lu.se/trac/yat
10
11  The yat library is free software; you can redistribute it and/or
12  modify it under the terms of the GNU General Public License as
13  published by the Free Software Foundation; either version 2 of the
14  License, or (at your option) any later version.
15
16  The yat library is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19  General Public License for more details.
20
21  You should have received a copy of the GNU General Public License
22  along with this program; if not, write to the Free Software
23  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
24  02111-1307, USA.
25*/
26
27#include "Score.h"
28#include "yat/utility/vector.h"
29
30#include <cmath>
31
32namespace theplu {
33namespace yat {
34namespace statistics { 
35  /**
36     @brief Fisher's exact test.   
37
38     Fisher's Exact test is a procedure that you can use for data
39     in a two by two contingency table: \f[ \begin{tabular}{|c|c|}
40     \hline a&b \tabularnewline \hline c&d \tabularnewline \hline
41     \end{tabular} \f] Fisher's Exact Test is based on exact
42     probabilities from a specific distribution (the hypergeometric
43     distribution). There's really no lower bound on the amount of
44     data that is needed for Fisher's Exact Test. You do have to
45     have at least one data value in each row and one data value in
46     each column. If an entire row or column is zero, then you
47     don't really have a 2 by 2 table. But you can use Fisher's
48     Exact Test when one of the cells in your table has a zero in
49     it. Fisher's Exact Test is also very useful for highly
50     imbalanced tables. If one or two of the cells in a two by two
51     table have numbers in the thousands and one or two of the
52     other cells has numbers less than 5, you can still use
53     Fisher's Exact Test. For very large tables (where all four
54     entries in the two by two table are large), your computer may
55     take too much time to compute Fisher's Exact Test. In these
56     situations, though, you might as well use the Chi-square test
57     because a large sample approximation (that the Chi-square test
58     relies on) is very reasonable. If all elements are larger than
59     10 a Chi-square test is reasonable to use.
60     
61     @note The statistic assumes that each column and row sum,
62     respectively, are fixed. Just because you have a 2x2 table, this
63     assumption does not necessarily match your experimental setup. See
64     e.g. Barnard's test for an alternative.
65  */
66 
67  class Fisher : public Score
68  {
69 
70  public:
71    ///
72    /// Default Constructor.
73    ///
74    Fisher(bool absolute=true);
75
76    ///
77    /// Destructor
78    ///
79    virtual ~Fisher(void) {};
80         
81   
82    ///
83    /// @return Chi2 score
84    ///
85    double Chi2(void) const;
86
87    ///
88    /// Calculates the expected values under the null hypothesis.
89    /// a' = (a+c)(a+b)/(a+b+c+d)
90    ///
91    void expected(double& a, double& b, double& c, double& d) const;
92
93    ///
94    /// minimum_size is the threshold for when the p-value calculation
95    /// is performed using a Chi2 approximation.
96    ///
97    /// @return reference to minimum_size
98    ///
99    u_int& minimum_size(void);
100
101    ///
102    /// If absolute, the p-value is the two-sided p-value. If all
103    /// elements in table is at least minimum_size, a Chi2
104    /// approximation is used.
105    ///
106    /// @return p-value
107    ///
108    /// @note in weighted case, approximation Chi2 is always used.
109    ///
110    double p_value() const;
111   
112    ///
113    /// Function calculating score from 2x2 table for which the
114    /// elements are calculated as follows \n
115    /// target.binary(i) sample i in group a or c otherwise in b or d
116    /// \f$ value(i) > \f$ value_cutoff() sample i in group a or b
117    /// otherwise c or d\n
118    ///
119    /// @return odds ratio. If absolute_ is true and odds ratio is
120    /// less than unity 1 divided by odds ratio is returned
121    ///
122    double score(const classifier::Target& target, 
123                 const utility::vector& value);
124
125    ///
126    /// Weighted version of score. Each element in 2x2 table is
127    /// calculated as \f$ \sum w_i \f$, so when each weight is
128    /// unitary the same table is created as in the unweighted version
129    ///
130    /// @return odds ratio
131    ///
132    /// @see score
133    ///
134    double score(const classifier::Target& target, 
135                 const classifier::DataLookupWeighted1D& value);
136
137
138    ///
139    /// Weighted version of score. Each element in 2x2 table is
140    /// calculated as \f$ \sum w_i \f$, so when each weight is
141    /// unitary the same table is created as in the unweighted version
142    ///
143    /// @return odds ratio
144    ///
145    /// @see score
146    ///
147    double score(const classifier::Target& target, 
148                 const utility::vector& value,
149                 const utility::vector& weight); 
150
151    ///
152    /// \f$ \frac{ad}{bc} \f$
153    ///
154    /// @return odds ratio. If absolute_ is true and odds ratio is
155    /// less than unity, 1 divided by odds ratio is returned
156    ///
157    double score(const u_int a, const u_int b, 
158                 const u_int c, const u_int d); 
159
160    ///
161    /// Cutoff sets the limit whether a value should go into the left
162    /// or the right row. @see score
163    ///
164    /// @return reference to cutoff for row
165    ///
166    double& value_cutoff(void);
167
168  private:
169    double oddsratio(const double a, const double b, 
170                     const double c, const double d);
171
172    // two-sided
173    double p_value_approximative(void) const;
174    //two-sided
175    double p_value_exact(void) const;
176
177    double a_;
178    double b_;
179    double c_;
180    double d_;
181    u_int minimum_size_;
182    double oddsratio_;
183    double value_cutoff_;
184  };
185
186}}} // of namespace statistics, yat, and theplu
187
188#endif
Note: See TracBrowser for help on using the repository browser.