source: trunk/yat/statistics/utility.h @ 3137

Last change on this file since 3137 was 3137, checked in by Peter, 8 years ago

closes #772. new function that calculates mutual information

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 13.0 KB
Line 
1#ifndef _theplu_yat_statistics_utility_
2#define _theplu_yat_statistics_utility_
3
4// $Id: utility.h 3137 2013-11-28 04:11:23Z peter $
5
6/*
7  Copyright (C) 2004 Jari Häkkinen, Peter Johansson
8  Copyright (C) 2005 Peter Johansson
9  Copyright (C) 2006 Jari Häkkinen, Peter Johansson, Markus Ringnér
10  Copyright (C) 2007, 2008 Jari Häkkinen, Peter Johansson
11  Copyright (C) 2009, 2010, 2011, 2013 Peter Johansson
12
13  This file is part of the yat library, http://dev.thep.lu.se/yat
14
15  The yat library is free software; you can redistribute it and/or
16  modify it under the terms of the GNU General Public License as
17  published by the Free Software Foundation; either version 3 of the
18  License, or (at your option) any later version.
19
20  The yat library is distributed in the hope that it will be useful,
21  but WITHOUT ANY WARRANTY; without even the implied warranty of
22  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23  General Public License for more details.
24
25  You should have received a copy of the GNU General Public License
26  along with yat. If not, see <http://www.gnu.org/licenses/>.
27*/
28
29#include "Percentiler.h"
30
31#include "yat/classifier/DataLookupWeighted1D.h"
32#include "yat/classifier/Target.h"
33#include "yat/normalizer/utility.h"
34#include "yat/utility/concept_check.h"
35#include "yat/utility/DataIterator.h"
36#include "yat/utility/deprecate.h"
37#include "yat/utility/iterator_traits.h"
38#include "yat/utility/Vector.h"
39#include "yat/utility/VectorBase.h"
40#include "yat/utility/yat_assert.h"
41
42#include <boost/concept_check.hpp>
43#include <gsl/gsl_math.h>
44#include <gsl/gsl_statistics_double.h>
45
46#include <algorithm>
47#include <cmath>
48#include <iterator>
49#include <stdexcept>
50#include <vector>
51
52namespace theplu {
53namespace yat {
54namespace statistics {
55
56  /**
57     Adding a range [\a first, \a last) into an object of type T. The
58     requirements for the type T is to have an add(double, bool, double)
59     function.
60  */
61  template <typename T, typename ForwardIterator>
62  void add(T& o, ForwardIterator first, ForwardIterator last,
63           const classifier::Target& target);
64
65  /**
66     \brief Benjamini Hochberg multiple test correction
67
68     Given a sorted range of p-values such that
69     \f$ p_1 \le p_2 \le ... \le p_N \f$ a Benjamnini-Hochberg corrected
70     p-value, \c q, is calculated recursively as
71     \f$ q_i = \f$ min \f$(p_i \frac{N}{i}, q_{i+1})\f$ with the anchor
72     constraint that \f$ q_m = p_m \f$.
73
74     Requirements: \c BidirectionalIterator1 should be a
75     \bidirectional_iterator and \c BidirectionalIterator2 should be a
76     mutable \bidirectional_iterator
77
78     \since New in yat 0.8
79   */
80  template<typename BidirectionalIterator1, typename BidirectionalIterator2>
81  void benjamini_hochberg(BidirectionalIterator1 first,
82                          BidirectionalIterator1 last,
83                          BidirectionalIterator2 result);
84
85
86  ///
87  /// Calculates the probability to get \a k or smaller from a
88  /// hypergeometric distribution with parameters \a n1 \a n2 \a
89  /// t. Hypergeomtric situation you get in the following situation:
90  /// Let there be \a n1 ways for a "good" selection and \a n2 ways
91  /// for a "bad" selection out of a total of possibilities. Take \a
92  /// t samples without replacement and \a k of those are "good"
93  /// samples. \a k will follow a hypergeomtric distribution.
94  ///
95  /// @return cumulative hypergeomtric distribution functions P(k).
96  ///
97  double cdf_hypergeometric_P(unsigned int k, unsigned int n1, 
98                              unsigned int n2, unsigned int t);
99
100
101  /**
102     The entropy is calculated as \f$ - \sum_i p_i \log p_i \f$ where
103     \f$p_i = \frac{n_i}{\sum_j n_j} \f$
104
105     Requirements:
106     - \c InputIterator should be an \input_iterator
107     - \c InputIterator::value_type must be convertible to \c double
108
109     \since New in yat 0.12
110   */
111  template<typename InputIterator>
112  double entropy(InputIterator first, InputIterator last);
113
114  /**
115     \brief one-sided p-value
116
117     This function uses the t-distribution to calculate the one-sided
118     p-value. Given that the true correlation is zero (Null
119     hypothesis) the estimated correlation, r, after a transformation
120     is t-distributed:
121
122     \f$ \sqrt{(n-2)} \frac{r}{\sqrt{(1-r^2)}} \in t(n-2) \f$
123
124     \return Probability that correlation is larger than \a r by
125     chance when having \a n samples. For \a r larger or equal to 1.0,
126     0.0 is returned. For \a r smaller or equal to -1.0, 1.0 is
127     returned.
128   */
129  double pearson_p_value(double r, unsigned int n);
130
131  ///
132  /// @brief Computes the kurtosis of the data in a vector.
133  ///
134  /// The kurtosis measures how sharply peaked a distribution is,
135  /// relative to its width. The kurtosis is normalized to zero for a
136  /// gaussian distribution.
137  ///
138  double kurtosis(const utility::VectorBase&);
139
140
141  ///
142  /// @brief Median absolute deviation from median
143  ///
144  /// Function is non-mutable function
145  ///
146  /// Requirements: \c RandomAccessIterator should be a \ref
147  /// concept_data_iterator and \random_access_iterator
148  ///
149  /// Since 0.6 function also work with a \ref
150  /// concept_weighted_iterator
151  ///
152  template <class RandomAccessIterator>
153  double mad(RandomAccessIterator first, RandomAccessIterator last,
154             bool sorted=false);
155
156
157  ///
158  /// Median is defined to be value in the middle. If number of values
159  /// is even median is the average of the two middle values.  the
160  /// median value is given by p equal to 50. If \a sorted is false
161  /// (default), the range is copied, the copy is sorted, and then
162  /// used to calculate the median.
163  ///
164  /// Function is a non-mutable function, i.e., \a first and \a last
165  /// can be const_iterators.
166  ///
167  /// Requirements: \c RandomAccessIterator should be a \ref
168  /// concept_data_iterator and \random_access_iterator
169  ///
170  /// @return median of range
171  ///
172  template <class RandomAccessIterator>
173  double median(RandomAccessIterator first, RandomAccessIterator last,
174                bool sorted=false);
175
176
177  /**
178     \brief Calculates the mutual information of \a A.
179
180     The elements in A are unnormalized probabilies of the joint
181     distribution.
182
183     The mutual information is calculated as \f$ \sum \sum p(x,y) \log_2
184     \frac {p(x,y)} {p(x)p(y)} \f$ where
185     \f$ p(x,y) = \frac {A_{xy}}{\sum_{x,y} A_{xy}} \f$;
186     \f$ p(x) = \sum_y A_{xy} / \sum_{x,y} A_{xy} \f$;
187     \f$ p(y) = \sum_x A_{xy} / \sum_{x,y} A_{xy} \f$
188
189     Requirements:
190     - \c T must be a model of \ref concept_container_2d
191     - \c T::value_type must be convertible to \c double
192
193     \return mutual information in bits; if you want in natural base
194     multiply with \c M_LN2 (defined in \c gsl/gsl_math.h )
195
196     \since New in yat 0.12
197   */
198  template<class T>
199  double mutual_information(const T& A);
200
201  /**
202     The percentile is determined by the \a p, a number between 0 and
203     100. The percentile is found by interpolation, using the formula
204     \f$ percentile = (1 - \delta) x_i + \delta x_{i+1} \f$ where \a
205     p is floor\f$((n - 1)p/100)\f$ and \f$ \delta \f$ is \f$
206     (n-1)p/100 - i \f$.Thus the minimum value of the vector is given
207     by p equal to zero, the maximum is given by p equal to 100 and
208     the median value is given by p equal to 50. If @a sorted
209     is false (default), the vector is copied, the copy is sorted,
210     and then used to calculate the median.
211
212     Function is a non-mutable function, i.e., \a first and \a last
213     can be const_iterators.
214
215     Requirements: RandomAccessIterator is an iterator over a range of
216     doubles (or any type being convertable to double).
217
218     @return \a p'th percentile of range
219
220     \deprecated percentile2 will replace this function in the future
221
222     \note the definition of percentile used here is not identical to
223     that one used in percentile2 and Percentile. The difference is
224     smaller for large ranges.
225  */
226  template <class RandomAccessIterator>
227  double percentile(RandomAccessIterator first, RandomAccessIterator last,
228                    double p, bool sorted=false) YAT_DEPRECATE;
229
230  /**
231     \see Percentiler
232
233     \since new in yat 0.5
234   */
235  template <class RandomAccessIterator>
236  double percentile2(RandomAccessIterator first, RandomAccessIterator last,
237                     double p, bool sorted=false)
238  {
239    Percentiler percentiler(p, sorted);
240    return percentiler(first, last);
241  }
242
243  ///
244  /// @brief Computes the skewness of the data in a vector.
245  ///
246  /// The skewness measures the asymmetry of the tails of a
247  /// distribution.
248  ///
249  double skewness(const utility::VectorBase&);
250 
251
252  //////// template implementations ///////////
253
254  template <typename T, typename ForwardIterator>
255  void add(T& o, ForwardIterator first, ForwardIterator last,
256           const classifier::Target& target)
257  {
258    utility::iterator_traits<ForwardIterator> traits;
259    for (size_t i=0; first!=last; ++i, ++first)
260      o.add(traits.data(first), target.binary(i), traits.weight(first));
261  } 
262
263
264  template<typename BidirectionalIterator1, typename BidirectionalIterator2>
265  void benjamini_hochberg(BidirectionalIterator1 first,
266                          BidirectionalIterator1 last,
267                          BidirectionalIterator2 result)
268  {
269    using boost::Mutable_BidirectionalIterator;
270    BOOST_CONCEPT_ASSERT((boost::BidirectionalIterator<BidirectionalIterator1>));
271    BOOST_CONCEPT_ASSERT((Mutable_BidirectionalIterator<BidirectionalIterator2>));
272    size_t n = std::distance(first, last);
273    if (!n)
274      return;
275    std::advance(result, n-1);
276    first = last;
277    --first;
278    size_t rank = n;
279   
280    double prev = 1.0;
281    while (rank) {
282      *result = std::min(*first * n/static_cast<double>(rank), prev);
283      prev = *result;
284      --rank;
285      --first;
286      --result;
287    }
288  }
289
290
291  template<typename InputIterator>
292  double entropy(InputIterator first, InputIterator last)
293  {
294    BOOST_CONCEPT_ASSERT((boost::InputIterator<InputIterator>));
295    using boost::Convertible;
296    typedef typename InputIterator::value_type T;
297    BOOST_CONCEPT_ASSERT((Convertible<T,double>));
298    double sum = 0;
299    double N = 0;
300    for (; first != last; ++first) {
301      if (*first) {
302        N += *first;
303        sum += *first * std::log(static_cast<double>(*first));
304      }
305    }
306    return -sum / N + log(N);
307  }
308
309
310  template <class RandomAccessIterator>
311  double mad(RandomAccessIterator first, RandomAccessIterator last, 
312             bool sorted)
313  {
314    BOOST_CONCEPT_ASSERT((boost::RandomAccessIterator<RandomAccessIterator>));
315    BOOST_CONCEPT_ASSERT((utility::DataIteratorConcept<RandomAccessIterator>));
316    double m = median(first, last, sorted);
317    typedef typename std::iterator_traits<RandomAccessIterator>::value_type T;
318    std::vector<T> ad(std::distance(first, last));
319    // copy weights if needed
320    normalizer::detail::copy_weight_if_weighted(first, last, ad.begin());
321    // assign data (absolute deviation from median)
322    utility::iterator_traits<RandomAccessIterator> traits;
323    utility::DataIterator<typename std::vector<T>::iterator> first2(ad.begin());
324    while (first!=last) {
325      *first2 = std::abs(traits.data(first)-m);
326      ++first;
327      ++first2;
328    }
329    std::sort(ad.begin(), ad.end());
330    return median(ad.begin(), ad.end(), true);
331  }
332
333
334  template <class RandomAccessIterator>
335  double median(RandomAccessIterator first, RandomAccessIterator last,
336                bool sorted)
337  {
338    return percentile2(first, last, 50.0, sorted);
339  }
340
341
342  template<class T>
343  double mutual_information(const T& n)
344  {
345    BOOST_CONCEPT_ASSERT((utility::Container2D<T>));
346    using boost::Convertible;
347    BOOST_CONCEPT_ASSERT((Convertible<typename T::value_type,double>));
348
349    // p_x = \sum_y p_xy
350
351    // Mutual Information is defined as
352    // \sum_xy p_xy * log (p_xy / (p_x p_y)) =
353    // \sum_xy p_xy * [log p_xy - log p_x - log p_y]
354    // \sum_xy p_xy log p_xy - p_xy log p_x - p_xy log p_y
355    // \sum_xy p_xy log p_xy - \sum_x p_x log p_x - \sum_y p_y log p_y
356    // - entropy_xy + entropy_x + entropy_y
357
358    utility::Vector rowsum(n.columns(), 0);
359    for (size_t c = 0; c<n.columns(); ++c)
360      rowsum(c) = std::accumulate(n.begin_column(c), n.end_column(c), 0);
361
362    utility::Vector colsum(n.rows(), 0);
363    for (size_t r = 0; r<n.rows(); ++r)
364      colsum(r) = std::accumulate(n.begin_row(r), n.end_row(r), 0);
365
366    double mi  = - entropy(n.begin(), n.end());
367    mi += entropy(rowsum.begin(), rowsum.end());
368    mi += entropy(colsum.begin(), colsum.end());
369
370    return mi/M_LN2;
371  }
372
373
374  template <class RandomAccessIterator>
375  double percentile(RandomAccessIterator first, RandomAccessIterator last,
376                    double p, bool sorted)
377  {
378    BOOST_CONCEPT_ASSERT((boost::RandomAccessIterator<RandomAccessIterator>));
379    BOOST_CONCEPT_ASSERT((utility::DataIteratorConcept<RandomAccessIterator>));
380    // range is one value only is a special case
381    if (first+1 == last)
382      return utility::iterator_traits<RandomAccessIterator>().data(first);
383    if (sorted) {
384      // have to take care of this special case
385      if (p>=100)
386        return utility::iterator_traits<RandomAccessIterator>().data(--last);
387      double j = p/100 * (std::distance(first,last)-1);
388      int i = static_cast<int>(j);
389      return (1-j+floor(j))*first[i] + (j-floor(j))*first[i+1];
390    }
391    using std::iterator_traits;
392    typedef typename iterator_traits<RandomAccessIterator>::value_type value_t;
393    std::vector<value_t> v_copy;
394    v_copy.reserve(std::distance(first,last));
395    std::copy(first, last, std::back_inserter(v_copy));
396    size_t i = static_cast<size_t>(p/100 * (v_copy.size()-1));
397    if (i+2 < v_copy.size()) {
398      std::partial_sort(v_copy.begin(), v_copy.begin()+i+2, v_copy.end());
399    }
400    else
401      std::sort(v_copy.begin(), v_copy.end());
402    return percentile(v_copy.begin(), v_copy.end(), p, true);
403  }
404
405
406}}} // of namespace statistics, yat, and theplu
407
408#endif
Note: See TracBrowser for help on using the repository browser.