Changeset 3441


Ignore:
Timestamp:
Nov 23, 2015, 9:20:08 AM (6 years ago)
Author:
Peter
Message:

closes #841

Location:
trunk
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • trunk/test/fisher.cc

    r3298 r3441  
    126126  f.minimum_size() = 1000;
    127127  f.oddsratio(10,20,10,20);
    128   suite.add(suite.equal(f.p_value(), 1.0));
     128  suite.add(suite.equal(f.p_value(), 1.0, 20));
    129129
    130130  f.oddsratio(10, 20, 20, 200);
  • trunk/yat/statistics/Fisher.cc

    r3425 r3441  
    8686
    8787
     88  /*
     89    ( n )
     90    (k+1)   (k+1)!(n-k)!
     91    ----- = ------------
     92     (n)    k!(n-k-1)!
     93     (k)
     94   */
     95  double Fisher::choose_ratio(unsigned int n, unsigned int k) const
     96  {
     97    if (k==0)
     98      return n;
     99    if (k==n-1)
     100      return k+1;
     101    return static_cast<double>(n-k) / (k+1);
     102  }
     103
     104
     105  double Fisher::hypergeometric_ratio(unsigned int k, unsigned int n1,
     106                                      unsigned int n2, unsigned int t) const
     107  {
     108    return choose_ratio(n1, k) / choose_ratio(n2, t-k-1);
     109  }
     110
     111  double Fisher::lower_tail(unsigned int k, unsigned int n1, unsigned int n2,
     112                            unsigned int t) const
     113  {
     114    double sum = 0;
     115
     116    // P(X=k)
     117    double p0 = gsl_ran_hypergeometric_pdf(k, n1, n2, t);
     118
     119    // minimum possible outcome for X is max(0, t-n2)
     120    unsigned int i = std::max(n2, t)-n2;
     121    double p = gsl_ran_hypergeometric_pdf(i, n1, n2, t);
     122    // avoid double dipping P(k)
     123    for ( ; i<k; ++i) {
     124      if (p<=p0)
     125        sum += p;
     126      else
     127        break;
     128
     129      // calculate p(i+1) using recursive function
     130      p *= hypergeometric_ratio(i, n1, n2, t);
     131    }
     132
     133    return sum;
     134  }
     135
     136
    88137  unsigned int& Fisher::minimum_size(void)
    89138  {
     
    131180      return p_value_approximative()/2.0;
    132181    }
     182    return p_left_exact();
     183  }
     184
     185
     186  double Fisher::p_left_exact(void) const
     187  {
    133188    // check for overflow
    134189    assert(c_ <= c_+d_ && d_ <= c_+d_);
     
    155210      return p_value_approximative()/2.0;
    156211    }
     212    return p_right_exact();
     213  }
     214
     215
     216  double Fisher::p_right_exact(void) const
     217  {
    157218    // check for overflow
    158219    assert(c_ <= c_+d_ && d_ <= c_+d_);
     
    178239  double Fisher::p_value_exact(void) const
    179240  {
    180     double res=0;
    181     double a, c, tmp;
    182     expected(a, tmp, c, tmp);
    183     // sum P(x) over x for which abs(x-E(a))>=abs(a-E(a))
    184 
    185     // counting left tail
    186     int k = static_cast<int>(a - std::abs(a_-a));
    187     if (k>=0)
    188       res = cdf_hypergeometric_P(k, a_+b_, c_+d_, a_+c_);
    189     // counting right tail
    190     k = static_cast<int>(c - std::abs(c_-c));
    191     if (k>=0)
    192       res += cdf_hypergeometric_P(k, c_+d_, a_+b_, a_+c_);
    193 
    194     // avoid p>1 due to double counting middle one
    195     return std::min(1.0, res);
     241    // The hypergeometric distribution is unimodal (only one peak) and the
     242    // peak (mode) occurs at: (t+1)*(n1+1)/(n+1)
     243
     244    double mode = static_cast<double>(a_+c_+1)*static_cast<double>(a_+b_+1)/
     245      static_cast<double>(a_+b_+c_+d_+1);
     246    if (a_ >= mode)
     247      return p_value_exact(a_, a_+b_, c_+d_, a_+c_);
     248    return p_value_exact(c_, c_+d_, a_+b_, a_+c_);
     249  }
     250
     251
     252  /*
     253    a | b | n1
     254    c | d | n2
     255    -------------------
     256    t |   | n
     257
     258
     259    The p is calculated as the sum of all P(x) for all x such that
     260    P(x)<=P(k)
     261  */
     262  double Fisher::p_value_exact(unsigned int k, unsigned int n1,
     263                               unsigned int n2, unsigned int t) const
     264  {
     265    assert(k<=t);
     266    assert(t<=n1+n2);
     267    assert(n1);
     268    assert(n2);
     269
     270    assert(k);
     271    // P(X >= k) = P(X > k-1)
     272    double sum = gsl_cdf_hypergeometric_Q(k-1, n1, n2, t);
     273    // calculate the other tail
     274    sum += lower_tail(k, n1, n2, t);
     275    return sum;
    196276  }
    197277
  • trunk/yat/statistics/Fisher.h

    r3425 r3441  
    148148       hypergeometric distribution
    149149       \f$ \sum_k P(k) \f$ where summation runs over \a k such that
    150        \f$ |k-<a>| \ge |a-<a>| \f$.
     150       \f$ P(k) \le P(a) \f$.
    151151
    152152       \return two-sided p-value
     
    195195    // two-sided
    196196    double p_value_approximative(void) const;
    197     //two-sided
     197    // two-sided
    198198    double p_value_exact(void) const;
     199    // calculate two-sided p-value to get k (or fewer) wins when
     200    // drawing t samples out of of a population of n1 wins and n2 losses
     201    double p_value_exact(unsigned int k, unsigned int n1, unsigned int n2,
     202                         unsigned int t) const;
     203
     204    double lower_tail(unsigned int k, unsigned int n1, unsigned int n2,
     205                      unsigned int t) const;
     206
     207    // return P(X=k+1) / P(X=k)
     208    double hypergeometric_ratio(unsigned int k, unsigned int n1,
     209                                unsigned int n2, unsigned int t) const;
     210    double choose_ratio(unsigned int n, unsigned int k) const;
     211
     212    double p_left_exact(void) const;
     213    double p_right_exact(void) const;
    199214
    200215    double yates(unsigned int o, double e) const;
Note: See TracChangeset for help on using the changeset viewer.