# Changeset 1404 for trunk/yat/statistics

Ignore:
Timestamp:
Aug 8, 2008, 12:47:18 AM (14 years ago)
Message:

refs #366 - weighted percentile
Added a class Percentile that can calculate the percentile for both
unweighted and weighted ranges. In order to make sense of the weighted
case, I had to modify the definition of percentile (slightly). The old
function percentile is using the old definition, but a new function,
percentile2, is using the new function and is simply calling the
Percentile class. The median is the same for the two definitions and
therefore it makes no difference which function to call, but to enable
calculation of weighted median percentile2 is called.

Location:
trunk/yat/statistics
Files:
2 edited

### Legend:

Unmodified
 r1339 */ #include "yat/utility/DataWeight.h" #include "yat/utility/iterator_traits.h" #include "yat/utility/yat_assert.h" #include "yat/utility/WeightIterator.h" #include #include #include #include #include #include #include namespace theplu { \param perc percentile to calculate [0,100]. Defaule value is 50, which implies class will calculate median. \param sorted if true class assues that ranges are already \param sorted if true class assumes that ranges are already sorted, if false the range will copied to a new range which is sorted. /** The percentile is found by interpolation, using the formula \f$percentile = (1 - \delta) x_i + \delta x_{i+1} \f$ where \a p is floor\f$((n - 1)p/100)\f$ and \f$\delta \f$ is \f$(n-1)p/100 - i \f$. Thus the minimum value of the range is given by perc equal to zero, the maximum is given by perc equal to 100 and the median value is given by perc equal to 50. Function is a non-mutable function, i.e., \a first and \a last can be const_iterators. template double operator()(RandomAccessIterator first, RandomAccessIterator last) const; /* RandomAccessIterator last) const { return calculate(first, last, sorted_, typename utility::weighted_iterator_traits::type()); typename utility::weighted_iterator_traits::type()); } */ private: double perc_; bool sorted_; // unweighted version - NOTE range must be sorted // unweighted version template double calculate(RandomAccessIterator first, RandomAccessIterator last, utility::unweighted_iterator_tag tag) const; bool sorted, utility::unweighted_iterator_tag tag) const; // weighted version - NOTE range must be sorted // weighted version template double calculate(RandomAccessIterator first, RandomAccessIterator last, utility::weighted_iterator_tag tag) const; bool sorted, utility::weighted_iterator_tag tag) const; // using compiler generated copy // unweighted version template double Percentiler::operator()(RandomAccessIterator first, RandomAccessIterator last) const double Percentiler::calculate(RandomAccessIterator first, RandomAccessIterator last, bool sorted, utility::unweighted_iterator_tag tag) const { // range is one value only is a special case if (first+1 == last) return *first; // just for convenience typedef typename utility::weighted_iterator_traits::type weighted_tag; if (sorted_){ if (perc_>=100) return utility::iterator_traits().data(first); if (sorted) { size_t n = last - first; // have to take care of this special case if (perc_>= 100.0 - 50.0/n) return *(--last); return calculate(first, last, weighted_tag()); if (perc_<= 50.0/n) return *first; double j = n * (perc_-50.0/n) / 100.0; int i = static_cast(j); return (1-j+floor(j))*first[i] + (j-floor(j))*first[i+1]; } std::vector::value_type> v_copy; std::vector v_copy; v_copy.reserve(std::distance(first,last)); std::copy(first, last, std::back_inserter(v_copy)); size_t i = static_cast(perc_/100 * (v_copy.size()-1)); if (i+2 < v_copy.size()) { std::partial_sort(v_copy.begin(), v_copy.begin()+i+2, v_copy.end()); } else std::sort(v_copy.begin(), v_copy.end()); return calculate(v_copy.begin(), v_copy.end(), weighted_tag()); } // unweighted version template double Percentiler::calculate(RandomAccessIterator first, RandomAccessIterator last, utility::unweighted_iterator_tag tag) const { double j = perc_/100 * (std::distance(first,last)-1); int i = static_cast(j); return (1-j+floor(j))*first[i] + (j-floor(j))*first[i+1]; std::sort(v_copy.begin(), v_copy.end()); return calculate(v_copy.begin(), v_copy.end(), true, tag); } double Percentiler::calculate(RandomAccessIterator first, RandomAccessIterator last, bool sorted, utility::weighted_iterator_tag tag) const { assert(false && "not implemented"); return 0; if (sorted) { utility::iterator_traits trait; std::vector accum_w; accum_w.reserve(last-first); std::partial_sum(weight_iterator(first), weight_iterator(last), std::back_inserter(accum_w)); double sum_w=0; double w_bound=perc_/100.0*accum_w.back(); RandomAccessIterator upper(first); while (sum_w + 0.5*trait.weight(upper) < w_bound) { sum_w += trait.weight(upper); ++upper; } if (upper==last) return trait.data(--last); if (sum_w==0.0) { while (first!=last) if (trait.weight(first)) return trait.data(first); // all weights zero, hmm return 0; } // lower should have non-zero weight // and (lower, upper) should have zero weights RandomAccessIterator lower=upper-1; while (trait.weight(lower)==0) --lower; double p1=100.0/accum_w.back()*(sum_w + 0.5*trait.weight(upper)); double p0=100.0/accum_w.back()*(sum_w - 0.5*trait.weight(lower)); double alpha = (perc_-p0)/(p1-p0); utility::yat_assert(alpha<=1.0, "Percentile: alpha out of bounds"); return trait.data(lower)+alpha*(trait.data(upper)-trait.data(lower)); } std::vector v_copy; v_copy.reserve(last-first); std::copy(first, last, std::back_inserter(v_copy)); std::sort(v_copy.begin(), v_copy.end()); return calculate(v_copy.begin(), v_copy.end(), true, tag); }