Changeset 1404


Ignore:
Timestamp:
Aug 8, 2008, 12:47:18 AM (15 years ago)
Author:
Peter
Message:

refs #366 - weighted percentile
Added a class Percentile that can calculate the percentile for both
unweighted and weighted ranges. In order to make sense of the weighted
case, I had to modify the definition of percentile (slightly). The old
function percentile is using the old definition, but a new function,
percentile2, is using the new function and is simply calling the
Percentile class. The median is the same for the two definitions and
therefore it makes no difference which function to call, but to enable
calculation of weighted median percentile2 is called.

Location:
trunk
Files:
7 edited

Legend:

Unmodified
Added
Removed
  • trunk/test/iterator_test.cc

    r1403 r1404  
    110110    data_iter(aw2.begin());
    111111  suite.add(*data_iter == 1.0);
    112   suite.add(*make_data_iterator(aw2.begin()) == 1.0);
     112  suite.add(*data_iterator(aw2.begin()) == 1.0);
    113113  std::vector<double> stl_vec(1,0);
    114114  utility::DataIterator<std::vector<double>::iterator> 
     
    141141  utility::DataWeight element = *x_weighted.begin();
    142142  *(x_weighted.begin()+1) = element;
    143   double element_data = *make_data_iterator(x_weighted.begin());
     143  double element_data = *data_iterator(x_weighted.begin());
    144144  suite.add(element_data==x_weighted.begin()->data());
    145145
  • trunk/test/statistics_test.cc

    r1317 r1404  
    3030#include "yat/statistics/Average.h"
    3131#include "yat/statistics/utility.h"
     32#include "yat/utility/MatrixWeighted.h"
    3233#include "yat/utility/Vector.h"
    3334
     
    5657  data.resize(1);
    5758  statistics::median(data.begin(), data.end());
     59  // testing percentile2
     60  data.resize(5);
     61  std::copy(gsl_vec.begin(), gsl_vec.begin()+5, data.begin());
     62  std::map<double, double> correct;
     63  correct[0]=0;
     64  correct[10]=0;
     65  correct[30]=1;
     66  correct[40]=1.5;
     67  correct[50]=2;
     68  correct[70]=3;
     69  correct[90]=4;
     70  correct[100]=4;
     71  for (std::map<double, double>::const_iterator i=correct.begin();
     72       i!=correct.end(); ++i) {
     73    if (!suite.add(suite.equal(statistics::percentile2(data.begin(), data.end(),
     74                                                       i->first),
     75                               i->second))) {
     76      suite.err() << "error: unweighted percentile2 " << i->first
     77                  << "th percentile = "
     78                  << statistics::percentile2(data.begin(),data.end(),i->first)
     79                  << " expected to be " << i->second
     80                  << std::endl;
     81    }
     82  }
     83
     84  utility::MatrixWeighted mw(1,data.size(),1);
     85  for (size_t i=0; i<mw.columns(); ++i)
     86    mw(0,i).data()=data[i];
     87  if (!suite.equal(statistics::median(mw.begin_row(0), mw.end_row(0), true),2))
     88    suite.err() << "error: weighted median sorted failed" << std::endl;
     89  if (!suite.equal(statistics::median(mw.begin_row(0), mw.end_row(0), false),2))
     90    suite.err() << "error: weighted median failed" << std::endl;
     91  for (std::map<double, double>::const_iterator i=correct.begin();
     92       i!=correct.end(); ++i) {
     93    if (!suite.add(suite.equal(statistics::percentile2(mw.begin_row(0),
     94                                                       mw.end_row(0),
     95                                                       i->first),
     96                               i->second))) {
     97      suite.err() << "error: weighted percentile2 " << i->first
     98                  << "th percentile = "
     99                  << statistics::percentile2(mw.begin_row(0),
     100                                             mw.end_row(0), i->first)
     101                  << " expected to be " << i->second
     102                  << std::endl;
     103    }
     104  }
    58105
    59106  double skewness_gsl=statistics::skewness(gsl_vec);
  • trunk/yat/statistics/Percentiler.h

    r1339 r1404  
    2525*/
    2626
     27#include "yat/utility/DataWeight.h"
    2728#include "yat/utility/iterator_traits.h"
     29#include "yat/utility/yat_assert.h"
     30#include "yat/utility/WeightIterator.h"
    2831
    2932#include <algorithm>
    3033#include <cassert>
    3134#include <cmath>
     35#include <numeric>
     36#include <stdexcept>
    3237#include <vector>
     38
     39#include <iostream>
    3340
    3441namespace theplu {
     
    4754       \param perc percentile to calculate [0,100]. Defaule value is
    4855       50, which implies class will calculate median.
    49        \param sorted if true class assues that ranges are already
     56       \param sorted if true class assumes that ranges are already
    5057       sorted, if false the range will copied to a new range which is
    5158       sorted.
     
    5461
    5562    /**
    56        The percentile is found by interpolation, using the formula \f$
    57        percentile = (1 - \delta) x_i + \delta x_{i+1} \f$ where \a p
    58        is floor\f$((n - 1)p/100)\f$ and \f$ \delta \f$ is \f$
    59        (n-1)p/100 - i \f$. Thus the minimum value of the range is
    60        given by perc equal to zero, the maximum is given by perc equal
    61        to 100 and the median value is given by perc equal to 50.
    62 
    6363       Function is a non-mutable function, i.e., \a first and \a last
    6464       can be const_iterators.
     
    6868    template<typename RandomAccessIterator>
    6969    double operator()(RandomAccessIterator first,
    70                       RandomAccessIterator last) const;
    71     /*
     70                      RandomAccessIterator last) const
    7271    {
    7372      return calculate(first, last, sorted_,
    74             typename utility::weighted_iterator_traits<RandomAccessIterator>::type());
     73       typename utility::weighted_iterator_traits<RandomAccessIterator>::type());
    7574    }
    76     */
    7775  private:
    7876    double perc_;
    7977    bool sorted_;
    8078
    81     // unweighted version - NOTE range must be sorted
     79    // unweighted version
    8280    template<typename RandomAccessIterator>
    8381    double calculate(RandomAccessIterator first, RandomAccessIterator last,
    84                      utility::unweighted_iterator_tag tag) const;
     82                     bool sorted, utility::unweighted_iterator_tag tag) const;
    8583
    86     // weighted version - NOTE range must be sorted
     84    // weighted version
    8785    template<typename RandomAccessIterator>
    8886    double calculate(RandomAccessIterator first, RandomAccessIterator last,
    89                      utility::weighted_iterator_tag tag) const;
     87                     bool sorted, utility::weighted_iterator_tag tag) const;
    9088
    9189    // using compiler generated copy
     
    10098  // unweighted version
    10199  template<typename RandomAccessIterator>
    102   double Percentiler::operator()(RandomAccessIterator first,
    103                                  RandomAccessIterator last) const
     100  double
     101  Percentiler::calculate(RandomAccessIterator first,
     102                         RandomAccessIterator last,
     103                         bool sorted,
     104                         utility::unweighted_iterator_tag tag) const
    104105  {
    105106    // range is one value only is a special case
    106107    if (first+1 == last)
    107       return *first;
    108     // just for convenience
    109     typedef
    110       typename utility::weighted_iterator_traits<RandomAccessIterator>::type
    111       weighted_tag;
    112 
    113     if (sorted_){
    114       if (perc_>=100)
     108      return utility::iterator_traits<RandomAccessIterator>().data(first);
     109    if (sorted) {
     110      size_t n = last - first;
     111      // have to take care of this special case
     112      if (perc_>= 100.0 - 50.0/n)
    115113        return *(--last);
    116       return calculate(first, last, weighted_tag());
     114      if (perc_<= 50.0/n)
     115        return *first;
     116      double j = n * (perc_-50.0/n) / 100.0;
     117      int i = static_cast<int>(j);
     118      return (1-j+floor(j))*first[i] + (j-floor(j))*first[i+1];
    117119    }
    118120
    119     std::vector<typename std::iterator_traits<RandomAccessIterator>::value_type>
    120       v_copy;
     121    std::vector<double> v_copy;
    121122    v_copy.reserve(std::distance(first,last));
    122123    std::copy(first, last, std::back_inserter(v_copy));
    123     size_t i = static_cast<size_t>(perc_/100 * (v_copy.size()-1));
    124     if (i+2 < v_copy.size()) {
    125       std::partial_sort(v_copy.begin(), v_copy.begin()+i+2, v_copy.end());
    126     }
    127     else
    128       std::sort(v_copy.begin(), v_copy.end());
    129     return calculate(v_copy.begin(), v_copy.end(), weighted_tag());
    130   }
    131  
    132   // unweighted version
    133   template<typename RandomAccessIterator>
    134   double Percentiler::calculate(RandomAccessIterator first,
    135                                 RandomAccessIterator last,
    136                                 utility::unweighted_iterator_tag tag) const
    137   {
    138     double j = perc_/100 * (std::distance(first,last)-1);
    139     int i = static_cast<int>(j);
    140     return (1-j+floor(j))*first[i] + (j-floor(j))*first[i+1];
     124    std::sort(v_copy.begin(), v_copy.end());
     125    return calculate(v_copy.begin(), v_copy.end(), true, tag);
    141126  }
    142127
     
    146131  double Percentiler::calculate(RandomAccessIterator first,
    147132                                RandomAccessIterator last,
     133                                bool sorted,
    148134                                utility::weighted_iterator_tag tag) const
    149135  {
    150     assert(false && "not implemented");
    151     return 0;
     136    if (sorted) {
     137      utility::iterator_traits<RandomAccessIterator> trait;
     138      std::vector<double> accum_w;
     139      accum_w.reserve(last-first);
     140      std::partial_sum(weight_iterator(first),
     141                       weight_iterator(last),
     142                       std::back_inserter(accum_w));
     143
     144      double sum_w=0;
     145      double w_bound=perc_/100.0*accum_w.back();
     146      RandomAccessIterator upper(first);
     147      while (sum_w + 0.5*trait.weight(upper) < w_bound) {
     148        sum_w += trait.weight(upper);
     149        ++upper;
     150      }
     151      if (upper==last)
     152        return trait.data(--last);
     153      if (sum_w==0.0) {
     154        while (first!=last)
     155          if (trait.weight(first))
     156            return trait.data(first);
     157        // all weights zero, hmm
     158        return 0;
     159      }
     160      // lower should have non-zero weight
     161      // and (lower, upper) should have zero weights
     162      RandomAccessIterator lower=upper-1;
     163      while (trait.weight(lower)==0)
     164        --lower;
     165
     166      double p1=100.0/accum_w.back()*(sum_w + 0.5*trait.weight(upper));
     167      double p0=100.0/accum_w.back()*(sum_w - 0.5*trait.weight(lower));
     168      double alpha = (perc_-p0)/(p1-p0);
     169      utility::yat_assert<std::runtime_error>(alpha<=1.0,
     170                                              "Percentile: alpha out of bounds");
     171      return trait.data(lower)+alpha*(trait.data(upper)-trait.data(lower));
     172    }
     173
     174    std::vector<utility::DataWeight> v_copy;
     175    v_copy.reserve(last-first);
     176    std::copy(first, last, std::back_inserter(v_copy));
     177    std::sort(v_copy.begin(), v_copy.end());
     178    return calculate(v_copy.begin(), v_copy.end(), true, tag);
    152179  }
    153180
  • trunk/yat/statistics/utility.h

    r1338 r1404  
    153153  template <class T>
    154154  double median(T first, T last, const bool sorted=false)
    155   { return percentile(first, last, 50.0, sorted); }
     155  { return percentile2(first, last, 50.0, sorted); }
    156156
    157157  /**
     
    170170     
    171171     Requirements: T should be an iterator over a range of doubles (or
    172      any type being convertable to double). If \a sorted is false
    173      iterator must be mutable, else read-only iterator is also ok.
     172     any type being convertable to double).
    174173     
    175174     @return \a p'th percentile of range
    176   */
    177   template <class T>
    178   double percentile(T first, T last, double p, bool sorted=false)
     175
     176     \deprecated percentile2 will replace this function in the future
     177
     178     \note the definition of percentile used here is not identical to
     179     that one used in percentile2 and Percentile. The difference is
     180     smaller for large ranges.
     181  */
     182  template <class RandomAccessIterator>
     183  double percentile(RandomAccessIterator first, RandomAccessIterator last,
     184                    double p, bool sorted=false)
     185  {
     186    // range is one value only is a special case
     187    if (first+1 == last)
     188      return utility::iterator_traits<RandomAccessIterator>().data(first);
     189    if (sorted) {
     190      // have to take care of this special case
     191      if (p>=100)
     192        return utility::iterator_traits<RandomAccessIterator>().data(--last);
     193      double j = p/100 * (std::distance(first,last)-1);
     194      int i = static_cast<int>(j);
     195      return (1-j+floor(j))*first[i] + (j-floor(j))*first[i+1];
     196    }
     197
     198    std::vector<typename std::iterator_traits<RandomAccessIterator>::value_type>
     199      v_copy;
     200    v_copy.reserve(std::distance(first,last));
     201    std::copy(first, last, std::back_inserter(v_copy));
     202    size_t i = static_cast<size_t>(p/100 * (v_copy.size()-1));
     203    if (i+2 < v_copy.size()) {
     204      std::partial_sort(v_copy.begin(), v_copy.begin()+i+2, v_copy.end());
     205    }
     206    else
     207      std::sort(v_copy.begin(), v_copy.end());
     208    return percentile(v_copy.begin(), v_copy.end(), p, true);
     209  }
     210
     211
     212  /**
     213     \see Percentiler
     214     
     215     \since new in yat 0.5
     216   */
     217  template <class RandomAccessIterator>
     218  double percentile2(RandomAccessIterator first, RandomAccessIterator last,
     219                     double p, bool sorted=false)
    179220  {
    180221    Percentiler percentiler(p, sorted);
  • trunk/yat/utility/DataIterator.h

    r1377 r1404  
    4242    DataIterator<Base>               // Derived
    4343    , Base                          // Base
    44     , boost::remove_reference<typename iterator_traits<Base>::data_reference>
     44    , typename boost::remove_reference<typename iterator_traits<Base>::data_reference>::type
    4545    , boost::forward_traversal_tag    // CategoryOrTraversal
    4646    , typename iterator_traits<Base>::data_reference // Reference
     
    8484   */
    8585  template<typename Base>
    86   DataIterator<Base> make_data_iterator(Base base)
     86  DataIterator<Base> data_iterator(Base base)
    8787  {
    8888    return DataIterator<Base>(base);
  • trunk/yat/utility/MatrixWeighted.cc

    r1384 r1404  
    124124    columns_ = data.columns();
    125125    resize(data.columns(), data.rows());
    126     std::copy(data.begin(), data.end(), make_data_iterator(vec_.begin()));
    127     std::copy(weight.begin(), weight.end(), make_weight_iterator(vec_.begin()));
     126    std::copy(data.begin(), data.end(), data_iterator(vec_.begin()));
     127    std::copy(weight.begin(), weight.end(), weight_iterator(vec_.begin()));
    128128  }
    129129
  • trunk/yat/utility/WeightIterator.h

    r1377 r1404  
    4040    WeightIterator<Base>               // Derived
    4141    , Base                          // Base
    42     , boost::remove_reference<typename iterator_traits<Base>::weight_reference>
     42    , typename boost::remove_reference<typename iterator_traits<Base>::weight_reference>::type
    4343    , boost::forward_traversal_tag    // CategoryOrTraversal
    4444    , typename iterator_traits<Base>::weight_reference // Reference
     
    8383   */
    8484  template<typename Base>
    85   WeightIterator<Base> make_weight_iterator(Base base)
     85  WeightIterator<Base> weight_iterator(Base base)
    8686  {
    8787    return WeightIterator<Base>(base);
Note: See TracChangeset for help on using the changeset viewer.