Changeset 4037
- Timestamp:
- Jan 25, 2021, 5:08:28 AM (17 months ago)
- Location:
- branches/kendall-score
- Files:
-
- 3 added
- 4 edited
Legend:
- Unmodified
- Added
- Removed
-
branches/kendall-score/test/Makefile.am
r3987 r4037 101 101 test/poisson.test \ 102 102 test/queue.test test/queue2.test \ 103 test/random_distribution.test \ 104 test/range.test test/regression.test test/rnd.test \ 103 test/range.test \ 104 test/ranking.test \ 105 test/regression.test test/rnd.test \ 105 106 test/rng-mt.test \ 106 107 test/roc.test \ -
branches/kendall-score/test/kendall.cc
r3987 r4037 27 27 #include "yat/statistics/Kendall.h" 28 28 #include "yat/utility/Matrix.h" 29 #include "yat/utility/utility.h" 29 30 30 31 #include <gsl/gsl_cdf.h> … … 47 48 { 48 49 test::Suite suite(argc, argv); 49 50 test_add(suite); 51 test_copy(suite); 52 test_p(suite); 53 test_p_exact(suite); 54 test_p_with_ties(suite); 55 test_score(suite); 56 test_score_with_ties(suite); 50 using std::throw_with_nested; 51 try { 52 try { 53 test_add(suite); 54 } 55 catch (std::exception& e) { 56 throw_with_nested(std::runtime_error("test_add() failed")); 57 } 58 test_copy(suite); 59 test_p(suite); 60 test_p_exact(suite); 61 test_p_with_ties(suite); 62 try { 63 test_score(suite); 64 } 65 catch (std::exception& e) { 66 throw_with_nested(std::runtime_error("test_score() failed")); 67 } 68 try { 69 test_score_with_ties(suite); 70 } 71 catch (std::exception& e) { 72 throw_with_nested(std::runtime_error("test_score_with_ties() failed")); 73 } 74 } 75 catch (std::exception& e) { 76 suite.add(false); 77 suite.err() << "error: exception caught: what(): "; 78 utility::print_what(e, suite.err()); 79 } 57 80 return suite.return_value(); 58 81 } … … 62 85 { 63 86 suite.out() << YAT_TEST_PROLOGUE; 64 Kendall kendall; 87 suite.err() << "constructor\n"; 88 Kendall kendall; 89 suite.err() << "::add(0,0)\n"; 65 90 kendall.add(0,0); 66 kendall.add(1,1); 67 double score = kendall.score(); 68 if (!suite.add(suite.equal(score, 1.0))) 69 suite.err() << "score not 1.0\n"; 91 suite.err() << "::add(1,1)\n"; 92 kendall.add(1,1); 93 suite.err() << "::score()\n"; 94 try { 95 double score = kendall.score(); 96 if (!suite.add(suite.equal(score, 1.0))) 97 suite.err() << "score not 1.0\n"; 98 } 99 catch (std::exception& e) { 100 std::throw_with_nested(std::runtime_error("::score(void) failed")); 101 } 70 102 } 71 103 -
branches/kendall-score/yat/statistics/Kendall.cc
r3987 r4037 24 24 #include "Kendall.h" 25 25 26 #include <yat/utility/Ranking.h> 27 #include <yat/utility/stl_utility.h> 28 26 29 #include <gsl/gsl_cdf.h> 30 31 #include <boost/scoped_ptr.hpp> 27 32 28 33 #include <algorithm> 29 34 #include <cassert> 30 35 #include <cmath> 36 #include <iterator> 31 37 #include <limits> 32 38 #include <map> 39 #include <set> 33 40 #include <vector> 34 41 42 43 #include <iostream> // debug 35 44 namespace theplu { 36 45 namespace yat { … … 66 75 class Kendall::Pimpl 67 76 { 77 class Count 78 { 79 public: 80 Count(const std::multiset<std::pair<double, double> >& data); 81 long int count(void) const; 82 double score(void) const; 83 double variance(void) const; 84 class Ties 85 { 86 public: 87 Ties(void); 88 void add(size_t n); 89 bool have_ties(void) const { return n_pairs_; } 90 // \return \sum x * (x-1) 91 unsigned long int n_pairs(void) const { return n_pairs_;} 92 // \return \sum x * (x-1) * (x-2) 93 unsigned long int n_triples(void) const { return n_triples_; } 94 // \return \sum x * (x-1) * (2*x+5) 95 unsigned long int v_correction(void) const { return v_correction_; } 96 private: 97 unsigned long int n_pairs_; 98 unsigned long int n_triples_; 99 unsigned long int v_correction_; 100 }; 101 102 private: 103 Ties x_ties_; 104 Ties y_ties_; 105 106 // # pairs such that (x_i-x_j)(y_i-y_j) > 0 107 long int concordant_; 108 // # pairs such that (x_i-x_j)(y_i-y_j) < 0 109 long int disconcordant_; 110 // # pairs such that x_i!=x_j && y_i==y_j 111 long int extraX_; 112 // # pairs such that x_i==x_j && y_i!=y_j 113 long int extraY_; 114 // # pairs such that x_i==x_j && y_i==y_j 115 long int spare_; 116 117 template<typename Iterator> 118 void calculate_ties(Iterator first, Iterator last, Ties& ties) 119 { 120 while (first != last) { 121 Iterator upper = first; 122 size_t n = 1; 123 ++upper; 124 while (upper!=last && *upper==*first) { 125 ++n; 126 ++upper; 127 } 128 ties.add(n); 129 first = upper; 130 } 131 } 132 }; 133 68 134 public: 69 Pimpl(void) : updated_(false) {} 70 71 void add(double x, double y) 72 { 73 x_.push_back(x); 74 std::map<double, size_t>::iterator it = x_count_.lower_bound(x); 75 if (it==x_count_.end() || it->first != x) 76 x_count_.insert(it, std::make_pair(x, 1)); 77 else 78 ++it->second; 79 y_.push_back(y); 80 it = y_count_.lower_bound(y); 81 if (it==y_count_.end() || it->first != y) 82 y_count_.insert(it, std::make_pair(y, 1)); 83 else 84 ++it->second; 85 updated_ = false; 135 Pimpl(void); 136 Pimpl(const Pimpl& other); 137 Pimpl& operator=(const Pimpl& rhs); 138 void add(double x, double y); 139 size_t n(void) const; 140 /// \return one-sided p-value 141 double p_approx(bool right) const; 142 double p_exact(bool right, bool left) const; 143 void reset(void); 144 double score(void) const; 145 private: 146 // return # concordant pairs minus # discordant pairs 147 long int count(void) const; 148 // data always sort wrt first and then second (if first are equal) 149 std::multiset<std::pair<double, double> > data_; 150 // calculated in score(void) 151 boost::scoped_ptr<Count> count_; 152 }; 153 154 155 // Kendall class 156 Kendall::Kendall(void) 157 : pimpl_(new Pimpl) 158 { 159 } 160 161 162 Kendall::Kendall(const Kendall& rhs) 163 : pimpl_(new Pimpl(*rhs.pimpl_)) 164 { 165 } 166 167 168 Kendall::~Kendall(void) 169 { 170 delete pimpl_; 171 } 172 173 174 void Kendall::add(double x, double y) 175 { 176 pimpl_->add(x, y); 177 } 178 179 180 size_t Kendall::n(void) const 181 { 182 return pimpl_->n(); 183 } 184 185 186 double Kendall::score(void) const 187 { 188 return pimpl_->score(); 189 } 190 191 192 double Kendall::p_left(bool exact) const 193 { 194 if (!exact) 195 return pimpl_->p_approx(false); 196 return pimpl_->p_exact(false, true); 197 } 198 199 200 double Kendall::p_right(bool exact) const 201 { 202 if (!exact) 203 return pimpl_->p_approx(true); 204 return pimpl_->p_exact(true, false); 205 } 206 207 208 double Kendall::p_value(bool exact) const 209 { 210 if (exact) 211 return pimpl_->p_exact(true, true); 212 if (score()>0.0) 213 return 2*p_right(false); 214 return 2*p_left(false); 215 } 216 217 218 void Kendall::reset(void) 219 { 220 pimpl_->reset(); 221 } 222 223 224 Kendall& Kendall::operator=(const Kendall& rhs) 225 { 226 if (&rhs == this) 227 return *this; 228 229 assert(pimpl_); 230 assert(rhs.pimpl_); 231 *pimpl_ = *rhs.pimpl_; 232 return *this; 233 } 234 235 236 Kendall::Pimpl::Count::Count(const std::multiset<std::pair<double,double>>& data) 237 { 238 // We follow 3 Algorithm SDTau for some-duplicate datasets in 239 // 'Fast Algorithms For The Calculation Of Kendall's Tau' 240 // by David Christen (Computational Statistics, March 2005) 241 242 // data is sorted w.r.t. ::first 243 calculate_ties(utility::pair_first_iterator(data.begin()), 244 utility::pair_first_iterator(data.end()), 245 x_ties_); 246 unsigned int d = 0; 247 unsigned int e = 0; 248 utility::Ranking<double> Y; 249 250 for (auto it=data.cbegin(); it!=data.end(); ++it) { 251 if (it != data.begin()) { 252 auto previous = std::prev(it); 253 if (it->first != previous->first) { // x not equal 254 d = 0; 255 e = 1; 256 } 257 else if (it->second == previous->second) // y also equal 258 ++e; 259 else { // x equal, y not equal 260 d += e; 261 e = 1; 262 } 263 } 264 265 auto lower = Y.lower_bound(it->second); 266 // number of element in Y smaller than it->second 267 int n_smaller = Y.ranking(lower); 268 // number of element in Y equal to it->second 269 int n_equal = 1; 270 auto upper = std::next(lower); 271 while (upper!=Y.cend() && *upper==*lower) { 272 ++upper; 273 ++n_equal; 274 } 275 Y.insert(lower, it->second); 276 size_t i = Y.size(); 277 278 long int a = n_smaller - d; 279 long int b = n_equal - e; 280 long int c = i - (a+b+d+e-1); 281 282 extraY_ += d; 283 extraX_ += b; 284 concordant_ += a; 285 disconcordant_ += c; 86 286 } 87 88 89 size_t n(void) const 90 { 91 assert(y_.size()==x_.size()); 92 return x_.size(); 287 assert(0); 288 } 289 290 291 long int Kendall::Pimpl::Count::count(void) const 292 { 293 return concordant_ - disconcordant_; 294 } 295 296 297 double Kendall::Pimpl::Count::score(void) const 298 { 299 double numerator = count(); 300 double denominator = concordant_ + disconcordant_; 301 if (extraX_ || extraY_) { 302 denominator = 303 std::sqrt((denominator + extraX_)*(denominator + extraY_)); 93 304 } 94 95 96 /// \return one-sided p-value 97 double p_approx(bool right) const 98 { 99 double k = count(); 100 if (!right) 101 k = -k; 102 // avoid overflow 103 double n = this->n(); 104 /* 105 According to wikipedia, 106 z = k / sqrt(v) 107 is approximately standard normal 108 v = (v0 - vt - vu)/18 + v1 + v2 109 v0 = n(n-1)(2n+5) 110 vt = \sum t(t-1)(2t+5) 111 vu = \sum u(u-1)(2u+5) 112 v1 = \sum t(t-1)) * \sum u(u-1) / (2n(n-1)) 113 v2 = sum t(t-1)(t-2) \sum u(u-1)(u-2) / (9n(n-1)(n-2)) 114 115 where t is number of equal values in group i 116 and similarly u for y. 117 */ 118 double v0 = n*(n-1)*(2*n+5); 119 double vt = 0; 120 double vu = 0; 121 double v1 = 0; 122 double v2 = 0; 123 // all correction terms above are zero in absence of ties 124 bool x_have_ties = x_.size() != x_count_.size(); 125 bool y_have_ties = y_.size() != y_count_.size(); 126 if (x_have_ties || y_have_ties) { 127 if (x_have_ties) 128 vt = v_correction(x_count_); 129 if (y_have_ties) 130 vu = v_correction(y_count_); 131 if (x_have_ties && y_have_ties) { 132 v1 = count_pair(x_count_) * (count_pair(y_count_) / (2*n*(n-1))); 133 v2 = count_triple(x_count_); 305 return numerator / denominator; 306 } 307 308 309 double Kendall::Pimpl::Count::variance(void) const 310 { 311 /* 312 According to wikipedia, 313 z = k / sqrt(v) 314 is approximately standard normal 315 v = (v0 - vt - vu)/18 + v1 + v2 316 v0 = n(n-1)(2n+5) 317 vt = \sum t(t-1)(2t+5) 318 vu = \sum u(u-1)(2u+5) 319 v1 = \sum t(t-1)) * \sum u(u-1) / (2n(n-1)) 320 v2 = sum t(t-1)(t-2) \sum u(u-1)(u-2) / (9n(n-1)(n-2)) 321 322 where t is number of equal values in group i and similarly u for 323 y. 324 */ 325 double n = score(); 326 double v0 = n*(n-1)*(2*n+5); 327 double vt = 0; 328 double vu = 0; 329 double v1 = 0; 330 double v2 = 0; 331 // all correction terms above are zero in absence of ties 332 bool x_have_ties = x_ties_.have_ties(); 333 bool y_have_ties = y_ties_.have_ties(); 334 if (x_have_ties || y_have_ties) { 335 if (x_have_ties) 336 vt = x_ties_.v_correction(); 337 if (y_have_ties) { 338 vu = y_ties_.v_correction(); 339 if (x_have_ties) { 340 v1 = x_ties_.n_pairs() * (y_ties_.n_pairs() / (2*n*(n-1))); 341 v2 = x_ties_.n_triples(); 134 342 if (v2) 135 v2 *= count_triple(y_count_) / (9*n*(n-1)*(n-2));343 v2 *= y_ties_.n_triples() / (9*n*(n-1)*(n-2)); 136 344 } 137 345 } 138 double v = (v0 - vt - vu)/18 + v1 + v2;139 return gsl_cdf_gaussian_Q(k, std::sqrt(v));140 346 } 141 142 143 double p_exact(bool right, bool left) const 144 { 347 return (v0 - vt - vu)/18 + v1 + v2; 348 } 349 350 351 Kendall::Pimpl::Count::Ties::Ties(void) 352 : n_pairs_(0), n_triples_(0), v_correction_(0) 353 {} 354 355 356 void Kendall::Pimpl::Count::Ties::add(size_t n) 357 { 358 unsigned long int factor = n * (n-1); 359 n_pairs_ += factor; 360 n_triples_ += factor * (n-2); 361 v_correction_ += factor * (2*n+5); 362 } 363 364 365 Kendall::Pimpl::Pimpl(void) 366 {} 367 368 369 Kendall::Pimpl::Pimpl(const Pimpl& other) 370 : data_(other.data_) 371 {} 372 373 374 Kendall::Pimpl& Kendall::Pimpl::operator=(const Pimpl& rhs) 375 { 376 data_ = rhs.data_; 377 count_.reset(); 378 return *this; 379 } 380 381 382 void Kendall::Pimpl::add(double x, double y) 383 { 384 data_.insert(std::make_pair(x, y)); 385 count_.reset(); 386 } 387 388 389 size_t Kendall::Pimpl::n(void) const 390 { 391 return data_.size(); 392 } 393 394 395 double Kendall::Pimpl::p_approx(bool right) const 396 { 397 double k = count(); 398 if (!right) 399 k = -k; 400 return gsl_cdf_gaussian_Q(k, std::sqrt(count_->variance())); 401 } 402 403 404 double Kendall::Pimpl::p_exact(bool right, bool left) const 405 { 406 assert(0); 407 return 0.0; 408 /* 145 409 long int upper = 0; 146 410 long int lower = 0; 147 if (right && left) { 148 upper = std::max(count(), -count()); 149 lower = std::min(count(), -count()); 150 } 151 else if (right && !left) { 152 upper = count(); 153 lower = std::numeric_limits<long int>::min(); 154 } 155 else if (!right && left) { 156 upper = std::numeric_limits<long int>::max(); 157 lower = count(); 411 if (right) { 412 if (left) { 413 upper = std::max(count(), -count()); 414 lower = std::min(count(), -count()); 158 415 } 159 416 else { 160 assert(false && "should not end up here"); 161 } 162 417 upper = count(); 418 lower = std::numeric_limits<long int>::min(); 419 } 420 } 421 else { 422 assert(left && "left or right must be true"); 423 upper = std::numeric_limits<long int>::max(); 424 lower = count(); 425 } 426 */ 427 /* 163 428 std::vector<double> x(x_); 164 429 std::sort(x.begin(), x.end()); … … 166 431 unsigned int total = 0; 167 432 do { 168 169 170 171 433 long int k = statistics::count(x.begin(), x.end(), y_.begin()); 434 if (k>=upper || k<=lower) 435 ++n; 436 ++total; 172 437 } 173 438 while (std::next_permutation(x.begin(), x.end())); 174 439 return static_cast<double>(n)/static_cast<double>(total); 175 } 176 177 178 void reset(void) 179 { 180 Pimpl tmp; 181 *this = tmp; 182 } 183 184 185 double score(void) const 186 { 187 double denominator = 0.5*n()*(n()-1.0); 188 if (have_ties()) { 189 double pairs = denominator; 190 denominator = std::sqrt((pairs - 0.5*count_pair(x_count_))* 191 (pairs - 0.5*count_pair(y_count_))); 192 } 193 return count() / denominator; 194 } 195 196 private: 197 long int count(void) const 198 { 199 if (!updated_) { 200 count_ = statistics::count(x_.begin(), x_.end(), y_.begin()); 201 updated_ = true; 202 } 203 return count_; 204 } 205 206 bool have_ties(void) const 207 { 208 return (x_.size() != x_count_.size()) || (y_.size() != y_count_.size()); 209 } 210 211 212 /* 213 \return \sum x * (x-1) * (2*x+5) where x is second in map \a count 214 */ 215 unsigned long int v_correction(const std::map<double, size_t>& count) const 216 { 217 unsigned long int result = 0; 218 std::map<double, size_t>::const_iterator end=count.end(); 219 for (std::map<double, size_t>::const_iterator i=count.begin();i!=end;++i) 220 result += i->second * (i->second - 1) * (2*i->second+5); 221 return result; 222 } 223 224 225 /* 226 \return \sum x * (x-1) where x is second in map \a count 227 */ 228 unsigned long int count_pair(const std::map<double, size_t>& count) const 229 { 230 unsigned long int result = 0; 231 typedef std::map<double, size_t>::const_iterator iterator; 232 iterator end=count.end(); 233 for (iterator i=count.begin(); i!=end;++i) 234 result += i->second * (i->second - 1); 235 return result; 236 } 237 238 /* 239 \return \sum x * (x-1) * (x-2) where x is second in map \a count 240 */ 241 unsigned long int count_triple(const std::map<double, size_t>& count) const 242 { 243 unsigned long int result = 0; 244 std::map<double, size_t>::const_iterator end=count.end(); 245 for (std::map<double, size_t>::const_iterator i=count.begin();i!=end;++i) 246 result += i->second * (i->second - 1) * (i->second-2); 247 return result; 248 } 249 250 std::vector<double> x_; 251 std::vector<double> y_; 252 std::map<double, size_t> x_count_; 253 std::map<double, size_t> y_count_; 254 mutable long int count_; 255 mutable bool updated_; 256 }; 257 258 259 // Kendall class 260 Kendall::Kendall(void) 261 : pimpl_(new Pimpl) 262 { 263 } 264 265 266 Kendall::Kendall(const Kendall& rhs) 267 : pimpl_(new Pimpl(*rhs.pimpl_)) 268 { 269 } 270 271 272 Kendall::~Kendall(void) 273 { 274 delete pimpl_; 275 } 276 277 278 void Kendall::add(double x, double y) 279 { 280 pimpl_->add(x, y); 281 } 282 283 284 size_t Kendall::n(void) const 285 { 286 return pimpl_->n(); 287 } 288 289 290 double Kendall::score(void) const 291 { 292 return pimpl_->score(); 293 } 294 295 296 double Kendall::p_left(bool exact) const 297 { 298 if (!exact) 299 return pimpl_->p_approx(false); 300 return pimpl_->p_exact(false, true); 301 } 302 303 304 double Kendall::p_right(bool exact) const 305 { 306 if (!exact) 307 return pimpl_->p_approx(true); 308 return pimpl_->p_exact(true, false); 309 } 310 311 312 double Kendall::p_value(bool exact) const 313 { 314 if (exact) 315 return pimpl_->p_exact(true, true); 316 if (score()>0.0) 317 return 2*p_right(false); 318 return 2*p_left(false); 319 } 320 321 322 void Kendall::reset(void) 323 { 324 pimpl_->reset(); 325 } 326 327 328 Kendall& Kendall::operator=(const Kendall& rhs) 329 { 330 if (&rhs == this) 331 return *this; 332 333 assert(pimpl_); 334 assert(rhs.pimpl_); 335 *pimpl_ = *rhs.pimpl_; 336 return *this; 440 */ 441 return 0; 442 } 443 444 445 void Kendall::Pimpl::reset(void) 446 { 447 Pimpl tmp; 448 *this = tmp; 449 } 450 451 452 double Kendall::Pimpl::score(void) const 453 { 454 count(); 455 assert(count_.get()); 456 return count_->score(); 457 } 458 459 460 long int Kendall::Pimpl::count(void) const 461 { 462 if (!count_) 463 // const_cast to allow lazy eval is more restrictive than 464 // making count_ mutable. 465 const_cast<Pimpl*>(this)->count_.reset(new Count(data_)); 466 return count_->count(); 337 467 } 338 468 -
branches/kendall-score/yat/utility/Makefile.am
r3999 r4037 2 2 3 3 # Copyright (C) 2005, 2006, 2007, 2008 Jari Häkkinen, Peter Johansson 4 # Copyright (C) 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017 , 2020Peter Johansson4 # Copyright (C) 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017 Peter Johansson 5 5 # 6 6 # This file is part of the yat library, http://dev.thep.lu.se/yat … … 44 44 yat/utility/OptionSwitch.cc \ 45 45 yat/utility/PCA.cc \ 46 yat/utility/Ranking.cc \ 46 47 yat/utility/split.cc \ 47 48 yat/utility/stl_utility.cc \ … … 121 122 $(srcdir)/yat/utility/StreamRedirect.h \ 122 123 $(srcdir)/yat/utility/Range.h \ 124 $(srcdir)/yat/utility/Ranking.h \ 123 125 $(srcdir)/yat/utility/stl_utility.h \ 124 126 $(srcdir)/yat/utility/StrideIterator.h \
Note: See TracChangeset
for help on using the changeset viewer.