Ignore:
Timestamp:
Aug 8, 2008, 12:47:18 AM (13 years ago)
Author:
Peter
Message:

refs #366 - weighted percentile
Added a class Percentile that can calculate the percentile for both
unweighted and weighted ranges. In order to make sense of the weighted
case, I had to modify the definition of percentile (slightly). The old
function percentile is using the old definition, but a new function,
percentile2, is using the new function and is simply calling the
Percentile class. The median is the same for the two definitions and
therefore it makes no difference which function to call, but to enable
calculation of weighted median percentile2 is called.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/yat/statistics/Percentiler.h

    r1339 r1404  
    2525*/
    2626
     27#include "yat/utility/DataWeight.h"
    2728#include "yat/utility/iterator_traits.h"
     29#include "yat/utility/yat_assert.h"
     30#include "yat/utility/WeightIterator.h"
    2831
    2932#include <algorithm>
    3033#include <cassert>
    3134#include <cmath>
     35#include <numeric>
     36#include <stdexcept>
    3237#include <vector>
     38
     39#include <iostream>
    3340
    3441namespace theplu {
     
    4754       \param perc percentile to calculate [0,100]. Defaule value is
    4855       50, which implies class will calculate median.
    49        \param sorted if true class assues that ranges are already
     56       \param sorted if true class assumes that ranges are already
    5057       sorted, if false the range will copied to a new range which is
    5158       sorted.
     
    5461
    5562    /**
    56        The percentile is found by interpolation, using the formula \f$
    57        percentile = (1 - \delta) x_i + \delta x_{i+1} \f$ where \a p
    58        is floor\f$((n - 1)p/100)\f$ and \f$ \delta \f$ is \f$
    59        (n-1)p/100 - i \f$. Thus the minimum value of the range is
    60        given by perc equal to zero, the maximum is given by perc equal
    61        to 100 and the median value is given by perc equal to 50.
    62 
    6363       Function is a non-mutable function, i.e., \a first and \a last
    6464       can be const_iterators.
     
    6868    template<typename RandomAccessIterator>
    6969    double operator()(RandomAccessIterator first,
    70                       RandomAccessIterator last) const;
    71     /*
     70                      RandomAccessIterator last) const
    7271    {
    7372      return calculate(first, last, sorted_,
    74             typename utility::weighted_iterator_traits<RandomAccessIterator>::type());
     73       typename utility::weighted_iterator_traits<RandomAccessIterator>::type());
    7574    }
    76     */
    7775  private:
    7876    double perc_;
    7977    bool sorted_;
    8078
    81     // unweighted version - NOTE range must be sorted
     79    // unweighted version
    8280    template<typename RandomAccessIterator>
    8381    double calculate(RandomAccessIterator first, RandomAccessIterator last,
    84                      utility::unweighted_iterator_tag tag) const;
     82                     bool sorted, utility::unweighted_iterator_tag tag) const;
    8583
    86     // weighted version - NOTE range must be sorted
     84    // weighted version
    8785    template<typename RandomAccessIterator>
    8886    double calculate(RandomAccessIterator first, RandomAccessIterator last,
    89                      utility::weighted_iterator_tag tag) const;
     87                     bool sorted, utility::weighted_iterator_tag tag) const;
    9088
    9189    // using compiler generated copy
     
    10098  // unweighted version
    10199  template<typename RandomAccessIterator>
    102   double Percentiler::operator()(RandomAccessIterator first,
    103                                  RandomAccessIterator last) const
     100  double
     101  Percentiler::calculate(RandomAccessIterator first,
     102                         RandomAccessIterator last,
     103                         bool sorted,
     104                         utility::unweighted_iterator_tag tag) const
    104105  {
    105106    // range is one value only is a special case
    106107    if (first+1 == last)
    107       return *first;
    108     // just for convenience
    109     typedef
    110       typename utility::weighted_iterator_traits<RandomAccessIterator>::type
    111       weighted_tag;
    112 
    113     if (sorted_){
    114       if (perc_>=100)
     108      return utility::iterator_traits<RandomAccessIterator>().data(first);
     109    if (sorted) {
     110      size_t n = last - first;
     111      // have to take care of this special case
     112      if (perc_>= 100.0 - 50.0/n)
    115113        return *(--last);
    116       return calculate(first, last, weighted_tag());
     114      if (perc_<= 50.0/n)
     115        return *first;
     116      double j = n * (perc_-50.0/n) / 100.0;
     117      int i = static_cast<int>(j);
     118      return (1-j+floor(j))*first[i] + (j-floor(j))*first[i+1];
    117119    }
    118120
    119     std::vector<typename std::iterator_traits<RandomAccessIterator>::value_type>
    120       v_copy;
     121    std::vector<double> v_copy;
    121122    v_copy.reserve(std::distance(first,last));
    122123    std::copy(first, last, std::back_inserter(v_copy));
    123     size_t i = static_cast<size_t>(perc_/100 * (v_copy.size()-1));
    124     if (i+2 < v_copy.size()) {
    125       std::partial_sort(v_copy.begin(), v_copy.begin()+i+2, v_copy.end());
    126     }
    127     else
    128       std::sort(v_copy.begin(), v_copy.end());
    129     return calculate(v_copy.begin(), v_copy.end(), weighted_tag());
    130   }
    131  
    132   // unweighted version
    133   template<typename RandomAccessIterator>
    134   double Percentiler::calculate(RandomAccessIterator first,
    135                                 RandomAccessIterator last,
    136                                 utility::unweighted_iterator_tag tag) const
    137   {
    138     double j = perc_/100 * (std::distance(first,last)-1);
    139     int i = static_cast<int>(j);
    140     return (1-j+floor(j))*first[i] + (j-floor(j))*first[i+1];
     124    std::sort(v_copy.begin(), v_copy.end());
     125    return calculate(v_copy.begin(), v_copy.end(), true, tag);
    141126  }
    142127
     
    146131  double Percentiler::calculate(RandomAccessIterator first,
    147132                                RandomAccessIterator last,
     133                                bool sorted,
    148134                                utility::weighted_iterator_tag tag) const
    149135  {
    150     assert(false && "not implemented");
    151     return 0;
     136    if (sorted) {
     137      utility::iterator_traits<RandomAccessIterator> trait;
     138      std::vector<double> accum_w;
     139      accum_w.reserve(last-first);
     140      std::partial_sum(weight_iterator(first),
     141                       weight_iterator(last),
     142                       std::back_inserter(accum_w));
     143
     144      double sum_w=0;
     145      double w_bound=perc_/100.0*accum_w.back();
     146      RandomAccessIterator upper(first);
     147      while (sum_w + 0.5*trait.weight(upper) < w_bound) {
     148        sum_w += trait.weight(upper);
     149        ++upper;
     150      }
     151      if (upper==last)
     152        return trait.data(--last);
     153      if (sum_w==0.0) {
     154        while (first!=last)
     155          if (trait.weight(first))
     156            return trait.data(first);
     157        // all weights zero, hmm
     158        return 0;
     159      }
     160      // lower should have non-zero weight
     161      // and (lower, upper) should have zero weights
     162      RandomAccessIterator lower=upper-1;
     163      while (trait.weight(lower)==0)
     164        --lower;
     165
     166      double p1=100.0/accum_w.back()*(sum_w + 0.5*trait.weight(upper));
     167      double p0=100.0/accum_w.back()*(sum_w - 0.5*trait.weight(lower));
     168      double alpha = (perc_-p0)/(p1-p0);
     169      utility::yat_assert<std::runtime_error>(alpha<=1.0,
     170                                              "Percentile: alpha out of bounds");
     171      return trait.data(lower)+alpha*(trait.data(upper)-trait.data(lower));
     172    }
     173
     174    std::vector<utility::DataWeight> v_copy;
     175    v_copy.reserve(last-first);
     176    std::copy(first, last, std::back_inserter(v_copy));
     177    std::sort(v_copy.begin(), v_copy.end());
     178    return calculate(v_copy.begin(), v_copy.end(), true, tag);
    152179  }
    153180
Note: See TracChangeset for help on using the changeset viewer.