source: branches/0.4-stable/test/subset_generator_test.cc @ 1743

Last change on this file since 1743 was 1743, checked in by Peter, 12 years ago

updating copyright statements

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 11.3 KB
Line 
1// $Id: subset_generator_test.cc 1743 2009-01-23 14:20:30Z peter $
2
3/*
4  Copyright (C) 2006 Jari Häkkinen, Peter Johansson, Markus Ringnér
5  Copyright (C) 2007, 2008 Jari Häkkinen, Peter Johansson
6
7  This file is part of the yat library, http://dev.thep.lu.se/yat
8
9  The yat library is free software; you can redistribute it and/or
10  modify it under the terms of the GNU General Public License as
11  published by the Free Software Foundation; either version 2 of the
12  License, or (at your option) any later version.
13
14  The yat library is distributed in the hope that it will be useful,
15  but WITHOUT ANY WARRANTY; without even the implied warranty of
16  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  General Public License for more details.
18
19  You should have received a copy of the GNU General Public License
20  along with this program; if not, write to the Free Software
21  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
22  02111-1307, USA.
23*/
24
#include "Suite.h"

#include "yat/classifier/BootstrapSampler.h"
#include "yat/classifier/CrossValidationSampler.h"
#include "yat/classifier/FeatureSelectorIR.h"
#include "yat/classifier/Kernel_SEV.h"
#include "yat/classifier/KernelLookup.h"
#include "yat/classifier/MatrixLookup.h"
#include "yat/classifier/PolynomialKernelFunction.h"
#include "yat/classifier/SubsetGenerator.h"
#include "yat/statistics/AUC.h"
#include "yat/utility/Matrix.h"

#include <cassert>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
42
43using namespace theplu::yat;
44
45bool class_count_test(const std::vector<size_t>&, test::Suite&);
46bool sample_count_test(const std::vector<size_t>&, test::Suite&);
47bool test_nested(test::Suite&);
48bool test_cv(test::Suite&);
49bool test_creation(test::Suite&);
50bool test_bootstrap(test::Suite&);
51
52
53int main(int argc, char* argv[])
54{ 
55  test::Suite suite(argc, argv);
56  suite.err() << "testing subset_generator" << std::endl;
57
58  test_creation(suite);
59  test_nested(suite);
60  test_cv(suite);
61
62  return suite.return_value();
63}
64
65
66bool test_creation(test::Suite& suite)
67{
68  bool ok=true;
69  std::ifstream is(test::filename("data/nm_target_bin.txt").c_str());
70  suite.err() << "loading target " << std::endl;
71  classifier::Target target(is);
72  is.close();
73  suite.err() << "number of targets: " << target.size() << std::endl;
74  suite.err() << "number of classes: " << target.nof_classes() << std::endl;
75  is.open(test::filename("data/nm_data_centralized.txt").c_str());
76  suite.err() << "loading data " << std::endl;
77  utility::Matrix m(is);
78  is.close();
79  classifier::MatrixLookup data(m);
80  suite.err() << "number of samples: " << data.columns() << std::endl;
81  suite.err() << "number of features: " << data.rows() << std::endl;
82  assert(data.columns()==target.size());
83
84  suite.err() << "building kernel" << std::endl;
85  classifier::PolynomialKernelFunction kf(1);
86  classifier::Kernel_SEV kernel_core(data,kf);
87  classifier::KernelLookup kernel(kernel_core);
88  suite.err() << "building Sampler" << std::endl;
89  classifier::CrossValidationSampler sampler(target, 30, 3);
90
91  statistics::AUC score;
92  classifier::FeatureSelectorIR fs(score, 96, 0);
93  suite.err() << "building SubsetGenerator" << std::endl;
94  classifier::SubsetGenerator<classifier::MatrixLookup> 
95    subset_data(sampler, data, fs);
96  classifier::SubsetGenerator<classifier::KernelLookup> 
97    subset_kernel(sampler, kernel,fs);
98  return ok;
99}
100
101bool test_nested(test::Suite& suite)
102{
103  bool ok=true;
104  //
105  // Test two nested CrossSplitters
106  //
107
108  suite.err() << "\ntesting two nested crossplitters" << std::endl;
109  std::vector<std::string> label(9);
110  label[0]=label[1]=label[2]="0";
111  label[3]=label[4]=label[5]="1";
112  label[6]=label[7]=label[8]="2";
113                 
114  classifier::Target target(label);
115  utility::Matrix raw_data2(2,9);
116  for(size_t i=0;i<raw_data2.rows();i++)
117    for(size_t j=0;j<raw_data2.columns();j++)
118      raw_data2(i,j)=i*10+10+j+1;
119   
120  classifier::MatrixLookup data2(raw_data2);
121  classifier::CrossValidationSampler cv2(target,3,3);
122  classifier::SubsetGenerator<classifier::MatrixLookup> cv_test(cv2,data2);
123
124  std::vector<size_t> sample_count(10,0);
125  std::vector<size_t> test_sample_count(9,0);
126  std::vector<size_t> test_class_count(3,0);
127  std::vector<double> test_value1(4,0);
128  std::vector<double> test_value2(4,0);
129  std::vector<double> t_value(4,0);
130  std::vector<double> v_value(4,0); 
131  for(unsigned long k=0;k<cv_test.size();k++) {
132   
133    const classifier::MatrixLookup& tv_view=cv_test.training_data(k);
134    const classifier::Target& tv_target=cv_test.training_target(k);
135    const utility::Index& tv_index=cv_test.training_index(k);
136    const classifier::MatrixLookup& test_view=cv_test.validation_data(k);
137    const classifier::Target& test_target=cv_test.validation_target(k);
138    const utility::Index& test_index=cv_test.validation_index(k);
139
140    for (size_t i=0; i<test_index.size(); i++) {
141      assert(test_index[i]<sample_count.size());
142      test_sample_count[test_index[i]]++;
143      test_class_count[target(test_index[i])]++;
144      test_value1[0]+=test_view(0,i);
145      test_value2[0]+=test_view(1,i);
146      test_value1[test_target(i)+1]+=test_view(0,i);
147      test_value2[test_target(i)+1]+=test_view(1,i);
148      if(test_target(i)!=target(test_index[i])) {
149        ok=false;
150        suite.err() << "ERROR: incorrect mapping of test indices" << std:: endl;
151      }       
152    }
153   
154    classifier::CrossValidationSampler sampler_training(tv_target,2,2);
155    classifier::SubsetGenerator<classifier::MatrixLookup> 
156      cv_training(sampler_training,tv_view);
157    std::vector<size_t> v_sample_count(6,0);
158    std::vector<size_t> t_sample_count(6,0);
159    std::vector<size_t> v_class_count(3,0);
160    std::vector<size_t> t_class_count(3,0);
161    std::vector<size_t> t_class_count2(3,0);
162    for(unsigned long l=0;l<cv_training.size();l++) {
163      const classifier::MatrixLookup& t_view=cv_training.training_data(l);
164      const classifier::Target& t_target=cv_training.training_target(l);
165      const utility::Index& t_index=cv_training.training_index(l);
166      const classifier::MatrixLookup& v_view=cv_training.validation_data(l);
167      const classifier::Target& v_target=cv_training.validation_target(l);
168      const utility::Index& v_index=cv_training.validation_index(l);
169     
170      if (test_index.size()+tv_index.size()!=target.size() 
171          || t_index.size()+v_index.size() != tv_target.size() 
172          || test_index.size()+v_index.size()+t_index.size() !=  target.size()){
173        ok = false;
174        suite.err() << "ERROR: size of training samples, validation samples " 
175               << "and test samples in is invalid." 
176               << std::endl;
177      }
178      if (test_index.size()!=3 || tv_index.size()!=6 || t_index.size()!=3 ||
179          v_index.size()!=3){
180        ok = false;
181        suite.err() << "ERROR: size of training, validation, and test samples"
182               << " is invalid." 
183               << " Expected sizes to be 3" << std::endl;
184      }     
185
186      std::vector<size_t> tv_sample_count(6,0);
187      for (size_t i=0; i<t_index.size(); i++) {
188        assert(t_index[i]<t_sample_count.size());
189        tv_sample_count[t_index[i]]++;
190        t_sample_count[t_index[i]]++;
191        t_class_count[t_target(i)]++;
192        t_class_count2[tv_target(t_index[i])]++;
193        t_value[0]+=t_view(0,i);
194        t_value[t_target(i)+1]+=t_view(0,i);       
195      }
196      for (size_t i=0; i<v_index.size(); i++) {
197        assert(v_index[i]<v_sample_count.size());
198        tv_sample_count[v_index[i]]++;
199        v_sample_count[v_index[i]]++;
200        v_class_count[v_target(i)]++;
201        v_value[0]+=v_view(0,i);
202        v_value[v_target(i)+1]+=v_view(0,i);
203      }
204 
205      ok = ok && sample_count_test(tv_sample_count,suite);     
206
207    }
208    ok = ok && sample_count_test(v_sample_count,suite);
209    ok = ok && sample_count_test(t_sample_count,suite);
210   
211    ok = ok && class_count_test(t_class_count,suite);
212    ok = ok && class_count_test(t_class_count2,suite);
213    ok = ok && class_count_test(v_class_count,suite);
214
215
216  }
217  ok = ok && sample_count_test(test_sample_count,suite);
218  ok = ok && class_count_test(test_class_count,suite);
219 
220  if(test_value1[0]!=135 || test_value1[1]!=36 || test_value1[2]!=45 ||
221     test_value1[3]!=54) {
222    ok=false;
223    suite.err() << "ERROR: incorrect sums of test values in row 1" 
224           << " found: " << test_value1[0] << ", "  << test_value1[1] 
225           << ", "  << test_value1[2] << " and "  << test_value1[3] 
226           << std::endl;
227  }
228
229 
230  if(test_value2[0]!=225 || test_value2[1]!=66 || test_value2[2]!=75 ||
231     test_value2[3]!=84) {
232    ok=false;
233    suite.err() << "ERROR: incorrect sums of test values in row 2" 
234           << " found: " << test_value2[0] << ", "  << test_value2[1] 
235           << ", "  << test_value2[2] << " and "  << test_value2[3] 
236           << std::endl;
237  }
238
239  if(t_value[0]!=270 || t_value[1]!=72 || t_value[2]!=90 || t_value[3]!=108)  {
240    ok=false;
241    suite.err() << "ERROR: incorrect sums of training values in row 1" 
242           << " found: " << t_value[0] << ", "  << t_value[1] 
243           << ", "  << t_value[2] << " and "  << t_value[3] 
244           << std::endl;   
245  }
246
247  if(v_value[0]!=270 || v_value[1]!=72 || v_value[2]!=90 || v_value[3]!=108)  {
248    ok=false;
249    suite.err() << "ERROR: incorrect sums of validation values in row 1" 
250           << " found: " << v_value[0] << ", "  << v_value[1] 
251           << ", "  << v_value[2] << " and "  << v_value[3] 
252           << std::endl;   
253  }
254  return ok;
255}
256
257bool class_count_test(const std::vector<size_t>& class_count, 
258                      test::Suite& suite) 
259{
260  bool ok=true;
261  for (size_t i=0; i<class_count.size(); i++)
262    if (class_count[i]==0){
263      ok = false;
264      suite.err() << "ERROR: class " << i << " was not in set." 
265             << " Expected at least one sample from each class." 
266             << std::endl;
267    }
268  return ok;
269}
270
271bool sample_count_test(const std::vector<size_t>& sample_count, 
272                       test::Suite& suite) 
273{
274  bool ok=true;
275  for (size_t i=0; i<sample_count.size(); i++){
276    if (sample_count[i]!=1){
277      ok = false;
278      suite.err() << "ERROR: sample " << i << " was in a group " << sample_count[i] 
279             << " times." << " Expected to be 1 time" << std::endl;
280    }
281  }
282  return ok;
283}
284
285
286bool test_bootstrap(test::Suite& suite)
287{
288  bool ok=true;
289  std::vector<std::string> label(10,"default");
290  label[2]=label[7]="white";
291  label[4]=label[5]="black";
292  label[6]=label[3]="green";
293  label[8]=label[9]="red";
294                 
295  classifier::Target target(label);
296  utility::Matrix raw_data(10,10);
297  classifier::MatrixLookup data(raw_data);
298  classifier::BootstrapSampler cv(target,3);
299  return ok;
300}
301
302
303bool test_cv(test::Suite& suite)
304{
305  bool ok=true;
306  std::vector<std::string> label(10,"default");
307  label[2]=label[7]="white";
308  label[4]=label[5]="black";
309  label[6]=label[3]="green";
310  label[8]=label[9]="red";
311                 
312  classifier::Target target(label);
313  utility::Matrix raw_data(10,10);
314  classifier::MatrixLookup data(raw_data);
315  classifier::CrossValidationSampler cv(target,3,3);
316 
317  std::vector<size_t> sample_count(10,0);
318  for (size_t j=0; j<cv.size(); ++j){
319    std::vector<size_t> class_count(5,0);
320    assert(j<cv.size());
321    if (cv.training_index(j).size()+cv.validation_index(j).size()!=
322        target.size()){
323      ok = false;
324      suite.err() << "ERROR: size of training samples plus " 
325             << "size of validation samples is invalid." << std::endl;
326    }
327    if (cv.validation_index(j).size()!=3 && cv.validation_index(j).size()!=4){
328      ok = false;
329      suite.err() << "ERROR: size of validation samples is invalid." 
330             << "expected size to be 3 or 4" << std::endl;
331    }
332    for (size_t i=0; i<cv.validation_index(j).size(); i++) {
333      assert(cv.validation_index(j)[i]<sample_count.size());
334      sample_count[cv.validation_index(j)[i]]++;
335    }
336    for (size_t i=0; i<cv.training_index(j).size(); i++) {
337      class_count[target(cv.training_index(j)[i])]++;
338    }
339    ok = ok && class_count_test(class_count,suite);
340  }
341  ok = ok && sample_count_test(sample_count,suite);
342 
343  return ok;
344}
Note: See TracBrowser for help on using the repository browser.