source: trunk/yat/statistics/utility.cc @ 4053

Last change on this file since 4053 was 4053, checked in by Peter, 8 months ago

add some test for correlation(Matrix); speed up, particuarly when #rows is much greater than #columns. Avoid copying and modifying the whole matrix, instead calculate simple sums and squared sums using BLAS functionality as much as possible.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 2.7 KB
Line 
1// $Id: utility.cc 4053 2021-03-26 03:29:42Z peter $
2
3/*
4  Copyright (C) 2004 Jari Häkkinen, Peter Johansson
5  Copyright (C) 2005 Peter Johansson
6  Copyright (C) 2006, 2007, 2008 Jari Häkkinen, Peter Johansson
7  Copyright (C) 2009, 2011, 2012, 2018 Peter Johansson
8
9  This file is part of the yat library, http://dev.thep.lu.se/yat
10
11  The yat library is free software; you can redistribute it and/or
12  modify it under the terms of the GNU General Public License as
13  published by the Free Software Foundation; either version 3 of the
14  License, or (at your option) any later version.
15
16  The yat library is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19  General Public License for more details.
20
21  You should have received a copy of the GNU General Public License
22  along with yat. If not, see <http://www.gnu.org/licenses/>.
23*/
24
25#include <config.h>
26
27#include "utility.h"
28
29#include "yat/normalizer/Centralizer.h"
30#include "yat/normalizer/ColumnNormalizer.h"
31
32#include <gsl/gsl_cdf.h>
33#include <gsl/gsl_randist.h>
34#include <gsl/gsl_statistics_double.h>
35
36#include <algorithm>
37#include <cassert>
38#include <cmath>
39#include <limits>
40
41namespace theplu {
42namespace yat {
43namespace statistics {
44
45  double cdf_hypergeometric_P(unsigned int k, unsigned int n1,
46                              unsigned int n2, unsigned int t)
47  {
48    return gsl_cdf_hypergeometric_P(k, n1, n2, t);
49  }
50
51
52  utility::Matrix correlation(const utility::Matrix& X)
53  {
54    size_t cols = X.columns();
55    size_t rows = X.rows();
56    utility::Vector m(cols);
57    utility::Vector x2(cols);
58    utility::Vector stddev(cols);
59    for (size_t i=0; i<cols; ++i) {
60      utility::VectorConstView vec = X.column_const_view(i);
61      m(i) = sum(vec);
62      x2(i) = vec * vec;
63      // scaled standard deviation
64      stddev(i) = std::sqrt(x2(i) - m(i) * m(i) / rows);
65    }
66
67    utility::Matrix corr(cols, cols, 1.0);
68    for (size_t i=0; i<cols; ++i)
69      for (size_t j=i+1; j<cols; ++j) {
70        corr(i,j) =
71          X.column_const_view(i) * X.column_const_view(j) - m(i)*m(j)/rows;
72        corr(i, j) /= stddev(i) * stddev(j);
73        // symmetry
74        corr(j, i) = corr(i,j);
75      }
76
77    return corr;
78  }
79
80
81  double pearson_p_value(double r, unsigned int n)
82  {
83    assert(n>=2);
84    if (n<=2)
85      return std::numeric_limits<double>::quiet_NaN();
86    if (r>=1.0)
87      return 0.0;
88    if (r<=-1.0)
89      return 1.0;
90    return gsl_cdf_tdist_Q(r*std::sqrt((n-2)/(1-r*r)), n-2);
91  }
92
93
94  double kurtosis(const utility::VectorBase& v)
95  {
96    const gsl_vector* gvp=v.gsl_vector_p();
97    return gsl_stats_kurtosis(gvp->data,gvp->stride,gvp->size);
98  }
99
100
101  double skewness(const utility::VectorBase& v)
102  {
103    const gsl_vector* gvp=v.gsl_vector_p();
104    return gsl_stats_skew(gvp->data,gvp->stride,gvp->size);
105  }
106
107}}} // of namespace statistics, yat, and theplu
Note: See TracBrowser for help on using the repository browser.