source: trunk/yat/statistics/Fisher.h @ 680

Last change on this file since 680 was 680, checked in by Jari Häkkinen, 15 years ago

Addresses #153. Introduced yat namespace. Removed alignment namespace. Clean up of code.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 5.7 KB
Line 
1#ifndef _theplu_yat_statistics_fisher_
2#define _theplu_yat_statistics_fisher_
3
4// $Id: Fisher.h 680 2006-10-11 17:49:03Z jari $
5
6/*
7  Copyright (C) The authors contributing to this file.
8
9  This file is part of the yat library, http://lev.thep.lu.se/trac/yat
10
11  The yat library is free software; you can redistribute it and/or
12  modify it under the terms of the GNU General Public License as
13  published by the Free Software Foundation; either version 2 of the
14  License, or (at your option) any later version.
15
16  The yat library is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19  General Public License for more details.
20
21  You should have received a copy of the GNU General Public License
22  along with this program; if not, write to the Free Software
23  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
24  02111-1307, USA.
25*/
26
27#include "Score.h"
28#include "yat/utility/vector.h"
29
30#include <cmath>
31
32namespace theplu {
33namespace yat {
34namespace statistics { 
35  /**
36     @brief Fisher's exact test.   
37
38     Fisher's Exact test is a procedure that you can use for data
39     in a two by two contingency table: \f[ \begin{tabular}{|c|c|}
40     \hline a&b \tabularnewline \hline c&d \tabularnewline \hline
41     \end{tabular} \f] Fisher's Exact Test is based on exact
42     probabilities from a specific distribution (the hypergeometric
43     distribution). There's really no lower bound on the amount of
44     data that is needed for Fisher's Exact Test. You do have to
45     have at least one data value in each row and one data value in
46     each column. If an entire row or column is zero, then you
47     don't really have a 2 by 2 table. But you can use Fisher's
48     Exact Test when one of the cells in your table has a zero in
49     it. Fisher's Exact Test is also very useful for highly
50     imbalanced tables. If one or two of the cells in a two by two
51     table have numbers in the thousands and one or two of the
52     other cells has numbers less than 5, you can still use
53     Fisher's Exact Test. For very large tables (where all four
54     entries in the two by two table are large), your computer may
55     take too much time to compute Fisher's Exact Test. In these
56     situations, though, you might as well use the Chi-square test
57     because a large sample approximation (that the Chi-square test
58     relies on) is very reasonable. If all elements are larger than
59     10 a Chi-square test is reasonable to use.
60     
61     @note The statistic assumes that each column sum and row sum,
62     respectively, is fixed. Just because you have a 2x2 table, this
63     assumption does not necessarily match your experimental setup. See
64     e.g. Barnard's test for an alternative.
65  */
66 
67  class Fisher : public Score
68  {
69 
70  public:
71    ///
72    /// Default Constructor.
73    ///
74    Fisher(bool absolute=true);
75
76    ///
77    /// Destructor
78    ///
79    virtual ~Fisher(void) {};
80         
81   
82    ///
83    /// @return Chi2 score
84    ///
85    double Chi2(void) const;
86
87    ///
88    /// Cutoff sets the limit whether a value should go into the left
89    /// or the right row. @see score
90    ///
91    /// @return reference to cutoff for row
92    ///
93    inline double& value_cutoff(void) { return value_cutoff_; }
94
95    ///
96    /// Calculates the expected values under the null hypothesis.
97    /// a' = (a+c)(a+b)/(a+b+c+d)
98    ///
99    void expected(double& a, double& b, double& c, double& d) const;
100
101    ///
102    /// minimum_size is the threshold for when the p-value calculation
103    /// is performed using a Chi2 approximation.
104    ///
105    /// @return reference to minimum_size
106    ///
107    inline u_int& minimum_size(void){ return minimum_size_; } 
108
109    ///
110    /// If absolute, the p-value is the two-sided p-value. If all
111    /// elements in table is at least minimum_size, a Chi2
112    /// approximation is used.
113    ///
114    /// @return p-value
115    ///
116    /// @note in weighted case, approximation Chi2 is always used.
117    ///
118    double p_value() const;
119   
120    ///
121    /// Function calculating score from 2x2 table for which the
122    /// elements are calculated as follows \n
123    /// target.binary(i) sample i in group a or c otherwise in b or d
124    /// \f$ value(i) > \f$ value_cutoff() sample i in group a or b
125    /// otherwise c or d\n
126    ///
127    /// @return odds ratio. If absolute_ is true and odds ratio is
128    /// less than unity 1 divided by odds ratio is returned
129    ///
130    double score(const classifier::Target& target, 
131                 const utility::vector& value);
132
133    ///
134    /// Weighted version of score. Each element in 2x2 table is
135    /// calculated as \f$ \sum w_i \f$, so when each weight is
136    /// unitary the same table is created as in the unweighted version
137    ///
138    /// @return odds ratio
139    ///
140    /// @see score
141    ///
142    double score(const classifier::Target& target, 
143                 const classifier::DataLookupWeighted1D& value);
144
145
146    ///
147    /// Weighted version of score. Each element in 2x2 table is
148    /// calculated as \f$ \sum w_i \f$, so when each weight is
149    /// unitary the same table is created as in the unweighted version
150    ///
151    /// @return odds ratio
152    ///
153    /// @see score
154    ///
155    double score(const classifier::Target& target, 
156                 const utility::vector& value,
157                 const utility::vector& weight); 
158
159    ///
160    /// \f$ \frac{ad}{bc} \f$
161    ///
162    /// @return odds ratio. If absolute_ is true and odds ratio is
163    /// less than unity, 1 divided by odds ratio is returned
164    ///
165    double score(const u_int a, const u_int b, 
166                 const u_int c, const u_int d); 
167   
168
169         
170  private:
171    double oddsratio(const double a, const double b, 
172                     const double c, const double d);
173
174    // two-sided
175    double p_value_approximative(void) const;
176    //two-sided
177    double p_value_exact(void) const;
178
179    double a_;
180    double b_;
181    double c_;
182    double d_;
183    u_int minimum_size_;
184    double oddsratio_;
185    double value_cutoff_;
186  };
187
188}}} // of namespace statistics, yat and theplu
189
190#endif
191
Note: See TracBrowser for help on using the repository browser.