#ifndef _theplu_yat_statistics_utility_
#define _theplu_yat_statistics_utility_
// $Id: utility.h 1338 2008-06-06 22:13:20Z peter $
/*
Copyright (C) 2004 Jari Häkkinen, Peter Johansson
Copyright (C) 2005 Peter Johansson
Copyright (C) 2006 Jari Häkkinen, Peter Johansson, Markus Ringnér
Copyright (C) 2007 Jari Häkkinen, Peter Johansson
Copyright (C) 2008 Peter Johansson
This file is part of the yat library, http://trac.thep.lu.se/yat
The yat library is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
The yat library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
02111-1307, USA.
*/
#include "Percentiler.h"
#include "yat/classifier/DataLookupWeighted1D.h"
#include "yat/classifier/Target.h"
#include "yat/utility/VectorBase.h"
#include "yat/utility/yat_assert.h"
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <iterator>
#include <vector>
namespace theplu {
namespace yat {
namespace statistics {
/**
   \brief 50th percentile

   Declaration only; the default value of \a sorted is specified
   here, so that callers may omit the third argument.

   \param first  iterator to the first element of the range
   \param last   iterator one past the last element of the range
   \param sorted set to true if the range is already sorted

   @see Percentiler
*/
template <typename T>
double median(T first, T last, const bool sorted=false);
/**
   \brief \a p'th percentile of a range

   Declaration only; the default value of \a sorted is specified
   here, so that callers may omit the last argument.

   \param first  iterator to the first element of the range
   \param last   iterator one past the last element of the range
   \param p      requested percentile
   \param sorted set to true if the range is already sorted

   \see Percentiler
*/
template <typename T>
double percentile(T first, T last, double p, bool sorted=false);
/**
Adding a range [\a first, \a last) into an object of type T. The
requirements for the type T is to have an add(double, bool, double)
function.
*/
template
void add(T& o, ForwardIterator first, ForwardIterator last,
const classifier::Target& target)
{
for (size_t i=0; first!=last; ++i, ++first)
o.add(utility::iterator_traits().data(first),
target.binary(i),
utility::iterator_traits().weight(first));
}
/**
   \brief Cumulative hypergeometric distribution function.

   Calculates the probability to get \a k or smaller from a
   hypergeometric distribution with parameters \a n1, \a n2 and \a t.

   The hypergeometric situation arises as follows: let there be
   \a n1 ways for a "good" selection and \a n2 ways for a "bad"
   selection, out of a total of \a n1 + \a n2 possibilities. Take
   \a t samples without replacement, of which \a k turn out to be
   "good". Then \a k follows a hypergeometric distribution.

   \return cumulative hypergeometric distribution function P(k)

   \deprecated Provided for backward compatibility with the 0.4
   API. Use gsl_cdf_hypergeometric_P
*/
double cdf_hypergeometric_P(unsigned int k,
                            unsigned int n1,
                            unsigned int n2,
                            unsigned int t);
///
/// @brief one-sided p-value
///
/// The p-value is calculated from the t-distribution. Under the Null
/// hypothesis that the true correlation is zero, the estimated
/// correlation, r, is t-distributed after the transformation
///
/// \f$ \sqrt{(n-2)} \frac{r}{\sqrt{(1-r^2)}} \in t(n-2) \f$
///
/// @return Probability that correlation is larger than \a r by
/// chance when having \a n samples.
///
double pearson_p_value(double r,
                       unsigned int n);
/**
   \brief Computes the kurtosis of the data in a vector.

   Kurtosis is a measure of how sharply peaked a distribution is,
   relative to its width. It is normalized such that a gaussian
   distribution has kurtosis zero.
*/
double kurtosis(const utility::VectorBase&);
///
/// @brief Median absolute deviation from median
///
/// Function is non-mutable function
///
template
double mad(T first, T last, const bool sorted=false)
{
double m = median(first, last, sorted);
std::vector ad;
ad.reserve(std::distance(first, last));
for( ; first!=last; ++first)
ad.push_back(fabs(*first-m));
std::sort(ad.begin(), ad.end());
return median(ad.begin(), ad.end(), true);
}
///
/// @brief Median of the range [\a first, \a last)
///
/// Median is defined to be value in the middle. If number of values
/// is even median is the average of the two middle values. The
/// median is calculated as the percentile with p equal to 50. If
/// \a sorted is false (default), the range is copied, the copy is
/// sorted, and then used to calculate the median.
///
/// Function is a non-mutable function, i.e., \a first and \a last
/// can be const_iterators.
///
/// Requirements: T should be an iterator over a range of doubles (or
/// any type being convertible to double).
///
/// @param first  iterator to the first element of the range
/// @param last   iterator one past the last element of the range
/// @param sorted set to true if the range is already sorted
///
/// @note The default value of \a sorted (false) is given by the
/// declaration of median earlier in this header. A default argument
/// must not be specified again in a later declaration, which is why
/// it is not repeated here.
///
/// @return median of range
///
template <typename T>
double median(T first, T last, const bool sorted)
{
  return percentile(first, last, 50.0, sorted);
}
/**
The percentile is determined by the \a p, a number between 0 and
100. The percentile is found by interpolation, using the formula
\f$ percentile = (1 - \delta) x_i + \delta x_{i+1} \f$ where \a
p is floor\f$((n - 1)p/100)\f$ and \f$ \delta \f$ is \f$
(n-1)p/100 - i \f$.Thus the minimum value of the vector is given
by p equal to zero, the maximum is given by p equal to 100 and
the median value is given by p equal to 50. If @a sorted
is false (default), the vector is copied, the copy is sorted,
and then used to calculate the median.
Function is a non-mutable function, i.e., \a first and \a last
can be const_iterators.
Requirements: T should be an iterator over a range of doubles (or
any type being convertable to double). If \a sorted is false
iterator must be mutable, else read-only iterator is also ok.
@return \a p'th percentile of range
*/
template
double percentile(T first, T last, double p, bool sorted=false)
{
Percentiler percentiler(p, sorted);
return percentiler(first, last);
}
/**
   \brief Computes the skewness of the data in a vector.

   Skewness is a measure of the asymmetry of the tails of a
   distribution.
*/
double skewness(const utility::VectorBase&);
}}} // of namespace statistics, yat, and theplu
#endif