source: trunk/test/subset_generator_test.cc @ 1275

Last change on this file since 1275 was 1275, checked in by Jari Häkkinen, 13 years ago

Updating copyright statements.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 11.2 KB
Line 
1// $Id: subset_generator_test.cc 1275 2008-04-11 06:10:12Z jari $
2
3/*
4  Copyright (C) 2006, 2007, 2008 Jari Häkkinen, Peter Johansson, Markus Ringnér
5
6  This file is part of the yat library, http://trac.thep.lu.se/yat
7
8  The yat library is free software; you can redistribute it and/or
9  modify it under the terms of the GNU General Public License as
10  published by the Free Software Foundation; either version 2 of the
11  License, or (at your option) any later version.
12
13  The yat library is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16  General Public License for more details.
17
18  You should have received a copy of the GNU General Public License
19  along with this program; if not, write to the Free Software
20  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
21  02111-1307, USA.
22*/
23
24#include "Suite.h"
25
26#include "yat/classifier/BootstrapSampler.h"
27#include "yat/classifier/CrossValidationSampler.h"
28#include "yat/classifier/FeatureSelectorIR.h"
29#include "yat/classifier/Kernel_SEV.h"
30#include "yat/classifier/KernelLookup.h"
31#include "yat/classifier/MatrixLookup.h"
32#include "yat/classifier/PolynomialKernelFunction.h"
33#include "yat/classifier/SubsetGenerator.h"
34#include "yat/statistics/AUC.h"
35#include "yat/utility/Matrix.h"
36
37#include <cassert>
38#include <fstream>
39#include <iostream>
40#include <string>
41
42using namespace theplu::yat;
43
44bool class_count_test(const std::vector<size_t>&, test::Suite&);
45bool sample_count_test(const std::vector<size_t>&, test::Suite&);
46bool test_nested(test::Suite&);
47bool test_cv(test::Suite&);
48bool test_creation(test::Suite&);
49bool test_bootstrap(test::Suite&);
50
51
52int main(int argc, char* argv[])
53{ 
54  test::Suite suite(argc, argv);
55  suite.err() << "testing subset_generator" << std::endl;
56
57  test_creation(suite);
58  test_nested(suite);
59  test_cv(suite);
60
61  return suite.return_value();
62}
63
64
65bool test_creation(test::Suite& suite)
66{
67  bool ok=true;
68  std::ifstream is(test::filename("data/nm_target_bin.txt").c_str());
69  suite.err() << "loading target " << std::endl;
70  classifier::Target target(is);
71  is.close();
72  suite.err() << "number of targets: " << target.size() << std::endl;
73  suite.err() << "number of classes: " << target.nof_classes() << std::endl;
74  is.open(test::filename("data/nm_data_centralized.txt").c_str());
75  suite.err() << "loading data " << std::endl;
76  utility::Matrix m(is);
77  is.close();
78  classifier::MatrixLookup data(m);
79  suite.err() << "number of samples: " << data.columns() << std::endl;
80  suite.err() << "number of features: " << data.rows() << std::endl;
81  assert(data.columns()==target.size());
82
83  suite.err() << "building kernel" << std::endl;
84  classifier::PolynomialKernelFunction kf(1);
85  classifier::Kernel_SEV kernel_core(data,kf);
86  classifier::KernelLookup kernel(kernel_core);
87  suite.err() << "building Sampler" << std::endl;
88  classifier::CrossValidationSampler sampler(target, 30, 3);
89
90  statistics::AUC score;
91  classifier::FeatureSelectorIR fs(score, 96, 0);
92  suite.err() << "building SubsetGenerator" << std::endl;
93  classifier::SubsetGenerator<classifier::MatrixLookup> 
94    subset_data(sampler, data, fs);
95  classifier::SubsetGenerator<classifier::KernelLookup> 
96    subset_kernel(sampler, kernel,fs);
97  return ok;
98}
99
100bool test_nested(test::Suite& suite)
101{
102  bool ok=true;
103  //
104  // Test two nested CrossSplitters
105  //
106
107  suite.err() << "\ntesting two nested crossplitters" << std::endl;
108  std::vector<std::string> label(9);
109  label[0]=label[1]=label[2]="0";
110  label[3]=label[4]=label[5]="1";
111  label[6]=label[7]=label[8]="2";
112                 
113  classifier::Target target(label);
114  utility::Matrix raw_data2(2,9);
115  for(size_t i=0;i<raw_data2.rows();i++)
116    for(size_t j=0;j<raw_data2.columns();j++)
117      raw_data2(i,j)=i*10+10+j+1;
118   
119  classifier::MatrixLookup data2(raw_data2);
120  classifier::CrossValidationSampler cv2(target,3,3);
121  classifier::SubsetGenerator<classifier::MatrixLookup> cv_test(cv2,data2);
122
123  std::vector<size_t> sample_count(10,0);
124  std::vector<size_t> test_sample_count(9,0);
125  std::vector<size_t> test_class_count(3,0);
126  std::vector<double> test_value1(4,0);
127  std::vector<double> test_value2(4,0);
128  std::vector<double> t_value(4,0);
129  std::vector<double> v_value(4,0); 
130  for(unsigned long k=0;k<cv_test.size();k++) {
131   
132    const classifier::MatrixLookup& tv_view=cv_test.training_data(k);
133    const classifier::Target& tv_target=cv_test.training_target(k);
134    const utility::Index& tv_index=cv_test.training_index(k);
135    const classifier::MatrixLookup& test_view=cv_test.validation_data(k);
136    const classifier::Target& test_target=cv_test.validation_target(k);
137    const utility::Index& test_index=cv_test.validation_index(k);
138
139    for (size_t i=0; i<test_index.size(); i++) {
140      assert(test_index[i]<sample_count.size());
141      test_sample_count[test_index[i]]++;
142      test_class_count[target(test_index[i])]++;
143      test_value1[0]+=test_view(0,i);
144      test_value2[0]+=test_view(1,i);
145      test_value1[test_target(i)+1]+=test_view(0,i);
146      test_value2[test_target(i)+1]+=test_view(1,i);
147      if(test_target(i)!=target(test_index[i])) {
148        ok=false;
149        suite.err() << "ERROR: incorrect mapping of test indices" << std:: endl;
150      }       
151    }
152   
153    classifier::CrossValidationSampler sampler_training(tv_target,2,2);
154    classifier::SubsetGenerator<classifier::MatrixLookup> 
155      cv_training(sampler_training,tv_view);
156    std::vector<size_t> v_sample_count(6,0);
157    std::vector<size_t> t_sample_count(6,0);
158    std::vector<size_t> v_class_count(3,0);
159    std::vector<size_t> t_class_count(3,0);
160    std::vector<size_t> t_class_count2(3,0);
161    for(unsigned long l=0;l<cv_training.size();l++) {
162      const classifier::MatrixLookup& t_view=cv_training.training_data(l);
163      const classifier::Target& t_target=cv_training.training_target(l);
164      const utility::Index& t_index=cv_training.training_index(l);
165      const classifier::MatrixLookup& v_view=cv_training.validation_data(l);
166      const classifier::Target& v_target=cv_training.validation_target(l);
167      const utility::Index& v_index=cv_training.validation_index(l);
168     
169      if (test_index.size()+tv_index.size()!=target.size() 
170          || t_index.size()+v_index.size() != tv_target.size() 
171          || test_index.size()+v_index.size()+t_index.size() !=  target.size()){
172        ok = false;
173        suite.err() << "ERROR: size of training samples, validation samples " 
174               << "and test samples in is invalid." 
175               << std::endl;
176      }
177      if (test_index.size()!=3 || tv_index.size()!=6 || t_index.size()!=3 ||
178          v_index.size()!=3){
179        ok = false;
180        suite.err() << "ERROR: size of training, validation, and test samples"
181               << " is invalid." 
182               << " Expected sizes to be 3" << std::endl;
183      }     
184
185      std::vector<size_t> tv_sample_count(6,0);
186      for (size_t i=0; i<t_index.size(); i++) {
187        assert(t_index[i]<t_sample_count.size());
188        tv_sample_count[t_index[i]]++;
189        t_sample_count[t_index[i]]++;
190        t_class_count[t_target(i)]++;
191        t_class_count2[tv_target(t_index[i])]++;
192        t_value[0]+=t_view(0,i);
193        t_value[t_target(i)+1]+=t_view(0,i);       
194      }
195      for (size_t i=0; i<v_index.size(); i++) {
196        assert(v_index[i]<v_sample_count.size());
197        tv_sample_count[v_index[i]]++;
198        v_sample_count[v_index[i]]++;
199        v_class_count[v_target(i)]++;
200        v_value[0]+=v_view(0,i);
201        v_value[v_target(i)+1]+=v_view(0,i);
202      }
203 
204      ok = ok && sample_count_test(tv_sample_count,suite);     
205
206    }
207    ok = ok && sample_count_test(v_sample_count,suite);
208    ok = ok && sample_count_test(t_sample_count,suite);
209   
210    ok = ok && class_count_test(t_class_count,suite);
211    ok = ok && class_count_test(t_class_count2,suite);
212    ok = ok && class_count_test(v_class_count,suite);
213
214
215  }
216  ok = ok && sample_count_test(test_sample_count,suite);
217  ok = ok && class_count_test(test_class_count,suite);
218 
219  if(test_value1[0]!=135 || test_value1[1]!=36 || test_value1[2]!=45 ||
220     test_value1[3]!=54) {
221    ok=false;
222    suite.err() << "ERROR: incorrect sums of test values in row 1" 
223           << " found: " << test_value1[0] << ", "  << test_value1[1] 
224           << ", "  << test_value1[2] << " and "  << test_value1[3] 
225           << std::endl;
226  }
227
228 
229  if(test_value2[0]!=225 || test_value2[1]!=66 || test_value2[2]!=75 ||
230     test_value2[3]!=84) {
231    ok=false;
232    suite.err() << "ERROR: incorrect sums of test values in row 2" 
233           << " found: " << test_value2[0] << ", "  << test_value2[1] 
234           << ", "  << test_value2[2] << " and "  << test_value2[3] 
235           << std::endl;
236  }
237
238  if(t_value[0]!=270 || t_value[1]!=72 || t_value[2]!=90 || t_value[3]!=108)  {
239    ok=false;
240    suite.err() << "ERROR: incorrect sums of training values in row 1" 
241           << " found: " << t_value[0] << ", "  << t_value[1] 
242           << ", "  << t_value[2] << " and "  << t_value[3] 
243           << std::endl;   
244  }
245
246  if(v_value[0]!=270 || v_value[1]!=72 || v_value[2]!=90 || v_value[3]!=108)  {
247    ok=false;
248    suite.err() << "ERROR: incorrect sums of validation values in row 1" 
249           << " found: " << v_value[0] << ", "  << v_value[1] 
250           << ", "  << v_value[2] << " and "  << v_value[3] 
251           << std::endl;   
252  }
253  return ok;
254}
255
256bool class_count_test(const std::vector<size_t>& class_count, 
257                      test::Suite& suite) 
258{
259  bool ok=true;
260  for (size_t i=0; i<class_count.size(); i++)
261    if (class_count[i]==0){
262      ok = false;
263      suite.err() << "ERROR: class " << i << " was not in set." 
264             << " Expected at least one sample from each class." 
265             << std::endl;
266    }
267  return ok;
268}
269
270bool sample_count_test(const std::vector<size_t>& sample_count, 
271                       test::Suite& suite) 
272{
273  bool ok=true;
274  for (size_t i=0; i<sample_count.size(); i++){
275    if (sample_count[i]!=1){
276      ok = false;
277      suite.err() << "ERROR: sample " << i << " was in a group " << sample_count[i] 
278             << " times." << " Expected to be 1 time" << std::endl;
279    }
280  }
281  return ok;
282}
283
284
285bool test_bootstrap(test::Suite& suite)
286{
287  bool ok=true;
288  std::vector<std::string> label(10,"default");
289  label[2]=label[7]="white";
290  label[4]=label[5]="black";
291  label[6]=label[3]="green";
292  label[8]=label[9]="red";
293                 
294  classifier::Target target(label);
295  utility::Matrix raw_data(10,10);
296  classifier::MatrixLookup data(raw_data);
297  classifier::BootstrapSampler cv(target,3);
298  return ok;
299}
300
301
302bool test_cv(test::Suite& suite)
303{
304  bool ok=true;
305  std::vector<std::string> label(10,"default");
306  label[2]=label[7]="white";
307  label[4]=label[5]="black";
308  label[6]=label[3]="green";
309  label[8]=label[9]="red";
310                 
311  classifier::Target target(label);
312  utility::Matrix raw_data(10,10);
313  classifier::MatrixLookup data(raw_data);
314  classifier::CrossValidationSampler cv(target,3,3);
315 
316  std::vector<size_t> sample_count(10,0);
317  for (size_t j=0; j<cv.size(); ++j){
318    std::vector<size_t> class_count(5,0);
319    assert(j<cv.size());
320    if (cv.training_index(j).size()+cv.validation_index(j).size()!=
321        target.size()){
322      ok = false;
323      suite.err() << "ERROR: size of training samples plus " 
324             << "size of validation samples is invalid." << std::endl;
325    }
326    if (cv.validation_index(j).size()!=3 && cv.validation_index(j).size()!=4){
327      ok = false;
328      suite.err() << "ERROR: size of validation samples is invalid." 
329             << "expected size to be 3 or 4" << std::endl;
330    }
331    for (size_t i=0; i<cv.validation_index(j).size(); i++) {
332      assert(cv.validation_index(j)[i]<sample_count.size());
333      sample_count[cv.validation_index(j)[i]]++;
334    }
335    for (size_t i=0; i<cv.training_index(j).size(); i++) {
336      class_count[target(cv.training_index(j)[i])]++;
337    }
338    ok = ok && class_count_test(class_count,suite);
339  }
340  ok = ok && sample_count_test(sample_count,suite);
341 
342  return ok;
343}
Note: See TracBrowser for help on using the repository browser.