Changeset 2709


Ignore:
Timestamp:
Mar 15, 2012, 4:26:47 AM (11 years ago)
Author:
Peter
Message:

refs #211. Changed how number of samples, n, is calculated in large
sample approximation p-value calculation. Before the sum_w was used
and now sum(w)*sum(w)/sum(ww) is used.

Location:
trunk
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • trunk/test/roc.cc

    r2708 r2709  
    275275  roc2.minimum_size() = 0;
    276276  if (!suite.equal(roc.p_value_one_sided(), roc2.p_value_one_sided())) {
    277     suite.xadd(false);
     277    suite.add(false);
    278278    suite.err() << "approximative p failed\n";
    279279  }
     
    333333{
    334334  suite.out() << "test p approx weighted\n";
    335   std::vector<double> x(100);
    336   std::vector<double> w(100, 1.0);
    337   std::deque<bool> label(100);
     335  std::vector<double> x(200);
     336  std::vector<double> w(200, 1.0);
     337  std::deque<bool> label(200);
    338338
    339339  for (size_t i=0; i<x.size(); ++i) {
    340340    x[i] = i;
    341341    label[i] = i>30 && i<70;
    342     w[i] = 100.0 / (i+100);
     342    if (i<100)
     343      w[i] = 100.0 / (100+i);
     344    else
     345      w[i] = 0.0001;
    343346  }
    344347
     
    346349  for (size_t i=0; i<x.size(); ++i)
    347350    roc.add(x[i], label[i], w[i]);
    348   suite.out() << "area: " << roc.area() << std::endl;
    349351  roc.minimum_size() = 0;
    350352  double p = roc.p_value_one_sided();
    351   suite.out() << "p: " << p << std::endl;
    352353
    353354  std::set<size_t> checkpoints;
    354   size_t perm = 1000000;
     355  size_t perm = 100000;
    355356  checkpoints.insert(10);
    356357  checkpoints.insert(100);
    357358  checkpoints.insert(1000);
    358359  checkpoints.insert(10000);
    359   checkpoints.insert(100000);
    360360  checkpoints.insert(perm);
    361361  statistics::Averager averager;
     
    374374        suite.err() << "error: approx p value and permutation p-value "
    375375                    << "deviate more than expected\n"
     376                    << "area: " << roc.area() << "\n"
    376377                    << "approx p: " << p << "\n"
    377378                    << "permutations: " << averager.n() << "\n"
    378379                    << "successful: " << averager.sum_x() << "\n"
    379380                    << "corresponds to P=" << averager.mean() << "\n";
    380         suite.xadd(false);
     381        suite.add(false);
    381382        return;
    382383      }
  • trunk/yat/statistics/ROC.cc

    r2697 r2709  
    44  Copyright (C) 2004, 2005 Peter Johansson
    55  Copyright (C) 2006, 2007, 2008 Jari Häkkinen, Peter Johansson
    6   Copyright (C) 2011 Peter Johansson
     6  Copyright (C) 2011, 2012 Peter Johansson
    77
    88  This file is part of the yat library, http://dev.thep.lu.se/yat
     
    5454    multimap_.insert(lower, element);
    5555    if (target)
    56       w_pos_+=w;
     56      pos_weights_.add(w);
    5757    else
    58       w_neg_+=w;
     58      neg_weights_.add(w);
    5959    area_ = std::numeric_limits<double>::quiet_NaN();
    6060  }
     
    7373  double ROC::get_p_approx(double x) const
    7474  {
     75    size_t n_pos = nof_points(pos_weights_);
     76    size_t n_neg = nof_points(neg_weights_);
     77    size_t nof_samples = n_pos + n_neg;
    7578    // make x standard normal
    7679    x -= 0.5;
    7780    // Not integrating from the middle of the bin, but from the inner edge.
    7881    if (x>0)
    79       x -= 0.5/(n_pos()*n_neg());
     82      x -= 0.5/(n_pos*n_neg);
    8083    else if(x<0)
    81       x += 0.5/(n_pos()*n_neg());
     84      x += 0.5/(n_pos*n_neg);
    8285    else
    8386      return 0.5;
    84     double var = 1.0+n();
     87    double var = 1.0+nof_samples;
    8588    if (has_ties_) {
    8689      double correction = 0;
     
    100103        mn/12 [ N+1 - 1/(N(N-1)) * sum(t(t-1)(t+1)) ]
    101104      */
    102       var -= correction/(n() * (n()-1));
    103     }
    104     var = var / (12*n_pos()*n_neg());
     105      var -= correction/(nof_samples * (nof_samples-1));
     106    }
     107    var = var / (12*n_pos*n_neg);
    105108    return gsl_cdf_gaussian_Q(x, std::sqrt(var));
    106109  }
     
    127130  double ROC::n_neg(void) const
    128131  {
    129     return w_neg_;
     132    return neg_weights_.sum_x();
    130133  }
    131134
     
    133136  double ROC::n_pos(void) const
    134137  {
    135     return w_pos_;
     138    return pos_weights_.sum_x();
     139  }
     140
     141
     142  size_t ROC::nof_points(const Averager& a) const
     143  {
     144    return a.sum_x()*a.sum_x()/a.sum_xx();
    136145  }
    137146
     
    140149  {
    141150    return p_exact_with_ties(multimap_.begin(), multimap_.end(),
    142                              area*w_pos_*w_neg_, w_pos_, w_neg_);
     151                             area*n_pos()*n_neg(), n_pos(), n_neg());
    143152  }
    144153
     
    159168      if (has_ties_) {
    160169        p += p_exact_with_ties(multimap_.rbegin(), multimap_.rend(),
    161                                abs_area*w_pos_*w_neg_, w_pos_, w_neg_);
     170                               abs_area*n_pos()*n_neg(), n_pos(), n_neg());
    162171      }
    163172      else
     
    189198    area_ = std::numeric_limits<double>::quiet_NaN();
    190199    has_ties_ = false;
    191     w_pos_=0;
    192     w_neg_=0;
     200    neg_weights_.reset();
     201    pos_weights_.reset();
    193202    multimap_.clear();
    194203  }
  • trunk/yat/statistics/ROC.h

    r2696 r2709  
    2424  along with yat. If not, see <http://www.gnu.org/licenses/>.
    2525*/
     26
     27#include "Averager.h"
    2628
    2729#include <gsl/gsl_randist.h>
     
    154156
    155157       where sum runs over different data values (of ties) and \f$ n_x
    156        \f$ is number data points with that value. The sum i a
     158       \f$ is number data points with that value. The sum is a
    157159       correction term for ties and is zero if there are no ties.
    158160
     161       The number of samples in a group, \f$ n^+ \f$, is calculated as
     162       \f$ n = (\sum w)^2 / \sum w^2 \f$
     163
    159164       \return \f$ P(a \ge \textrm{area}) \f$
    160 
    161        \note Weights should be -1, 0, or 1; otherwise the p-value is
    162        undefined and may change in future versions.
    163165     */
    164166    double p_value_one_sided(void) const;
     
    194196    double get_p_approx(double) const;
    195197
     198    /**
     199       return (sum x)^2 / sum x^2
     200     */
     201    size_t nof_points(const Averager& a) const;
     202
    196203    /*
    197204     */
     
    214221    bool has_ties_;
    215222    unsigned int minimum_size_;
    216     double w_neg_;
    217     double w_pos_;
     223    Averager neg_weights_;
     224    Averager pos_weights_;
    218225    Map multimap_;
    219226  };
Note: See TracChangeset for help on using the changeset viewer.