source: trunk/yat/statistics/Fisher.cc @ 3743

Last change on this file since 3743 was 3743, checked in by Peter, 5 years ago

merge release 0.15.1 into trunk

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 6.7 KB
Line 
1// $Id: Fisher.cc 3743 2018-07-12 00:43:25Z peter $
2
3/*
4  Copyright (C) 2004 Jari Häkkinen, Peter Johansson
5  Copyright (C) 2005 Peter Johansson
6  Copyright (C) 2006, 2007, 2008 Jari Häkkinen, Peter Johansson
7  Copyright (C) 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2018 Peter Johansson
8
9  This file is part of the yat library, http://dev.thep.lu.se/yat
10
11  The yat library is free software; you can redistribute it and/or
12  modify it under the terms of the GNU General Public License as
13  published by the Free Software Foundation; either version 3 of the
14  License, or (at your option) any later version.
15
16  The yat library is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19  General Public License for more details.
20
21  You should have received a copy of the GNU General Public License
22  along with yat. If not, see <http://www.gnu.org/licenses/>.
23*/
24
25#include <config.h>
26
27#include "Fisher.h"
28#include "utility.h"
29
30#include "yat/utility/Exception.h"
31
32#include <gsl/gsl_cdf.h>
33#include <gsl/gsl_randist.h>
34
35#include <algorithm>
36#include <cassert>
37
38namespace theplu {
39namespace yat {
40namespace statistics {
41
42
43  Fisher::Fisher(bool yates)
44    : a_(0), b_(0), c_(0), d_(0), minimum_size_(10), oddsratio_(1.0),
45      yates_(yates)
46  {
47  }
48
49
50  Fisher::~Fisher()
51  {
52  }
53
54
55  bool Fisher::calculate_p_exact(void) const
56  {
57    return ( a_<minimum_size_ || b_<minimum_size_ ||
58             c_<minimum_size_ || d_<minimum_size_);
59  }
60
61  double Fisher::Chi2(void) const
62  {
63    double a,b,c,d;
64    expected(a,b,c,d);
65    if (yates_)
66      return yates(a_, a) + yates(b_, b) + yates(c_, c) + yates(d_, d);
67
68    return (a-a_)*(a-a_)/a + (b-b_)*(b-b_)/b +
69      (c-c_)*(c-c_)/c + (d-d_)*(d-d_)/d;
70  }
71
72
73  void Fisher::expected(double& a, double& b, double& c, double& d) const
74  {
75    // use floting point arithmetic to avoid overflow
76    double a1 = a_;
77    double b1 = b_;
78    double c1 = c_;
79    double d1 = d_;
80    double N = a1 + b1 + c1 + d1;
81    a =((a1+b1)*(a1+c1)) / N;
82    b =((a1+b1)*(b1+d1)) / N;
83    c =((c1+d1)*(a1+c1)) / N;
84    d =((c1+d1)*(b1+d1)) / N;
85  }
86
87
88  /*
89   (n)       n!
90   ( )  = --------
91   (k)    k!(n-k)!
92
93
94
95    ( n )
96    (k+1)   k!(n-k)!
97    ----- = ------------ = (n-k) / (k+1)
98     (n)    (k+1)!(n-k-1)!
99     (k)
100   */
101  double Fisher::choose_ratio(unsigned int n, unsigned int k) const
102  {
103    assert(k+1 <= n);
104    return static_cast<double>(n-k) / (k+1);
105  }
106
107
108  double Fisher::hypergeometric_ratio(unsigned int k, unsigned int n1,
109                                      unsigned int n2, unsigned int t) const
110  {
111    // P(X = k+1) / P(X = k) =
112    // (choose(n1, k+1) * choose(n2, t-k-1) / choose(n1+n2, t) ) /
113    // (choose(n1, k) * choose(n2, t-k) / choose(n1+n2, t) ) =
114    // choose(n1, k+1) / choose(n1, k) /( choose(n2, t-k)/choose(n2,t-k-1) ) =
115    // choose(n1,k+1)/choose(n1,k) / (choose(n2,(t-k-1)+1)/choose(n2,t-k-1))
116    // choose_ratio(n1,k) / choose_ratio(n2, t-k-1)
117    // where choose_ratio(a, b) = choose(a, b+1) / choose(a,b)
118    return choose_ratio(n1, k) / choose_ratio(n2, t-k-1);
119  }
120
121  double Fisher::lower_tail(unsigned int k, unsigned int n1, unsigned int n2,
122                            unsigned int t) const
123  {
124    double sum = 0;
125
126    // P(X=k)
127    double p0 = gsl_ran_hypergeometric_pdf(k, n1, n2, t);
128
129    // minimum possible outcome for X is max(0, t-n2)
130    unsigned int i = std::max(n2, t)-n2;
131    double p = gsl_ran_hypergeometric_pdf(i, n1, n2, t);
132    // avoid double dipping P(k)
133    for ( ; i<k; ++i) {
134      if (p<=p0)
135        sum += p;
136      else
137        break;
138
139      // calculate p(i+1) using recursive function
140      p *= hypergeometric_ratio(i, n1, n2, t);
141    }
142
143    return sum;
144  }
145
146
147  unsigned int& Fisher::minimum_size(void)
148  {
149    return minimum_size_;
150  }
151
152
153  const unsigned int& Fisher::minimum_size(void) const
154  {
155    return minimum_size_;
156  }
157
158
159  double Fisher::oddsratio(const unsigned int a,
160                           const unsigned int b,
161                           const unsigned int c,
162                           const unsigned int d)
163  {
164    // If a column sum or a row sum is zero, the table is nonsense
165    if ((a==0 || d==0) && (c==0 || b==0)){
166      throw utility::runtime_error("Table in Fisher is not valid\n");
167    }
168    a_ = a;
169    b_ = b;
170    c_ = c;
171    d_ = d;
172
173    oddsratio_= (static_cast<double>(a) / static_cast<double>(b) *
174                 static_cast<double>(d) / static_cast<double>(c) );
175    return oddsratio_;
176  }
177
178
179  double Fisher::oddsratio(void) const
180  {
181    return oddsratio_;
182  }
183
184
185  double Fisher::p_left(void) const
186  {
187    if (!calculate_p_exact()) {
188      if (oddsratio_>1.0)
189        return 1.0-p_value_approximative()/2.0;
190      return p_value_approximative()/2.0;
191    }
192    return p_left_exact();
193  }
194
195
196  double Fisher::p_left_exact(void) const
197  {
198    // check for overflow
199    assert(c_ <= c_+d_ && d_ <= c_+d_);
200    assert(a_ <= a_+b_ && b_ <= a_+b_);
201    assert(a_ <= a_+c_ && c_ <= a_+c_);
202
203    return cdf_hypergeometric_P(a_, a_+b_, c_+d_, a_+c_);
204  }
205
206
207  double Fisher::p_value(void) const
208  {
209    if (calculate_p_exact())
210      return p_value_exact();
211    return p_value_approximative();
212  }
213
214
215  double Fisher::p_right(void) const
216  {
217    if (!calculate_p_exact()) {
218      if (oddsratio_<1.0)
219        return 1.0-p_value_approximative()/2.0;
220      return p_value_approximative()/2.0;
221    }
222    return p_right_exact();
223  }
224
225
226  double Fisher::p_right_exact(void) const
227  {
228    // check for overflow
229    assert(c_ <= c_+d_ && d_ <= c_+d_);
230    assert(a_ <= a_+b_ && b_ <= a_+b_);
231    assert(a_ <= a_+c_ && c_ <= a_+c_);
232
233    return cdf_hypergeometric_P(c_, c_+d_, a_+b_, a_+c_);
234  }
235
236
237  double Fisher::p_value_one_sided(void) const
238  {
239    return p_right();
240  }
241
242
243  double Fisher::p_value_approximative(void) const
244  {
245    return gsl_cdf_chisq_Q(Chi2(), 1.0);
246  }
247
248
249  double Fisher::p_value_exact(void) const
250  {
251    // The hypergeometric distribution is unimodal (only one peak) and the
252    // peak (mode) occurs at: floor((t+1)*(n1+1)/(n+1))
253
254    double mode = std::floor((a_+c_+1.0)*(a_+b_+1.0)/(a_+b_+c_+d_+1.0));
255    if (a_ >= mode)
256      return p_value_exact(a_, a_+b_, c_+d_, a_+c_);
257    return p_value_exact(c_, c_+d_, a_+b_, a_+c_);
258  }
259
260
261  /*
262    a | b | n1
263    c | d | n2
264    -------------------
265    t |   | n
266
267
268    The p is calculated as the sum of all P(x) for all x such that
269    P(x)<=P(k)
270  */
271  double Fisher::p_value_exact(unsigned int k, unsigned int n1,
272                               unsigned int n2, unsigned int t) const
273  {
274    assert(k<=t);
275    assert(t<=n1+n2);
276    assert(n1);
277    assert(n2);
278
279    // special case when k=0 and mode (peak of distribution) is at
280    // zero as well. If mode is not zero 2x2 table should have been
281    // mirrored before calling this function.
282    if (k == 0) {
283      assert(std::floor((a_+c_+1.0)*(a_+b_+1.0)/(a_+b_+c_+d_+1.0)) == 0.0);
284      return 1.0;
285    }
286
287    assert(k);
288    // P(X >= k) = P(X > k-1)
289    double sum = gsl_cdf_hypergeometric_Q(k-1, n1, n2, t);
290    // calculate the other tail
291    sum += lower_tail(k, n1, n2, t);
292    return sum;
293  }
294
295
296  double Fisher::yates(unsigned int o, double e) const
297  {
298    double x = std::abs(o - e) - 0.5;
299    return x*x/e;
300  }
301
302}}} // of namespace statistics, yat, and theplu
Note: See TracBrowser for help on using the repository browser.