source: trunk/test/subset_generator_test.cc @ 1659

Last change on this file since 1659 was 1487, checked in by Jari Häkkinen, 13 years ago

Addresses #436. GPL license copy reference should also be updated.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 11.2 KB
Line 
1// $Id: subset_generator_test.cc 1487 2008-09-10 08:41:36Z jari $
2
3/*
4  Copyright (C) 2006, 2007, 2008 Jari Häkkinen, Peter Johansson, Markus Ringnér
5
6  This file is part of the yat library, http://dev.thep.lu.se/yat
7
8  The yat library is free software; you can redistribute it and/or
9  modify it under the terms of the GNU General Public License as
10  published by the Free Software Foundation; either version 3 of the
11  License, or (at your option) any later version.
12
13  The yat library is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16  General Public License for more details.
17
18  You should have received a copy of the GNU General Public License
19  along with yat. If not, see <http://www.gnu.org/licenses/>.
20*/
21
22#include "Suite.h"
23
24#include "yat/classifier/BootstrapSampler.h"
25#include "yat/classifier/CrossValidationSampler.h"
26#include "yat/classifier/FeatureSelectorIR.h"
27#include "yat/classifier/Kernel_SEV.h"
28#include "yat/classifier/KernelLookup.h"
29#include "yat/classifier/MatrixLookup.h"
30#include "yat/classifier/PolynomialKernelFunction.h"
31#include "yat/classifier/SubsetGenerator.h"
32#include "yat/statistics/AUC.h"
33#include "yat/utility/Matrix.h"
34
35#include <cassert>
36#include <fstream>
37#include <iostream>
38#include <string>
39
40using namespace theplu::yat;
41
42bool class_count_test(const std::vector<size_t>&, test::Suite&);
43bool sample_count_test(const std::vector<size_t>&, test::Suite&);
44bool test_nested(test::Suite&);
45bool test_cv(test::Suite&);
46bool test_creation(test::Suite&);
47bool test_bootstrap(test::Suite&);
48
49
50int main(int argc, char* argv[])
51{ 
52  test::Suite suite(argc, argv);
53  suite.err() << "testing subset_generator" << std::endl;
54
55  test_creation(suite);
56  test_nested(suite);
57  test_cv(suite);
58
59  return suite.return_value();
60}
61
62
63bool test_creation(test::Suite& suite)
64{
65  bool ok=true;
66  std::ifstream is(test::filename("data/nm_target_bin.txt").c_str());
67  suite.err() << "loading target " << std::endl;
68  classifier::Target target(is);
69  is.close();
70  suite.err() << "number of targets: " << target.size() << std::endl;
71  suite.err() << "number of classes: " << target.nof_classes() << std::endl;
72  is.open(test::filename("data/nm_data_centralized.txt").c_str());
73  suite.err() << "loading data " << std::endl;
74  utility::Matrix m(is);
75  is.close();
76  classifier::MatrixLookup data(m);
77  suite.err() << "number of samples: " << data.columns() << std::endl;
78  suite.err() << "number of features: " << data.rows() << std::endl;
79  assert(data.columns()==target.size());
80
81  suite.err() << "building kernel" << std::endl;
82  classifier::PolynomialKernelFunction kf(1);
83  classifier::Kernel_SEV kernel_core(data,kf);
84  classifier::KernelLookup kernel(kernel_core);
85  suite.err() << "building Sampler" << std::endl;
86  classifier::CrossValidationSampler sampler(target, 30, 3);
87
88  statistics::AUC score;
89  classifier::FeatureSelectorIR fs(score, 96, 0);
90  suite.err() << "building SubsetGenerator" << std::endl;
91  classifier::SubsetGenerator<classifier::MatrixLookup> 
92    subset_data(sampler, data, fs);
93  classifier::SubsetGenerator<classifier::KernelLookup> 
94    subset_kernel(sampler, kernel,fs);
95  return ok;
96}
97
98bool test_nested(test::Suite& suite)
99{
100  bool ok=true;
101  //
102  // Test two nested CrossSplitters
103  //
104
105  suite.err() << "\ntesting two nested crossplitters" << std::endl;
106  std::vector<std::string> label(9);
107  label[0]=label[1]=label[2]="0";
108  label[3]=label[4]=label[5]="1";
109  label[6]=label[7]=label[8]="2";
110                 
111  classifier::Target target(label);
112  utility::Matrix raw_data2(2,9);
113  for(size_t i=0;i<raw_data2.rows();i++)
114    for(size_t j=0;j<raw_data2.columns();j++)
115      raw_data2(i,j)=i*10+10+j+1;
116   
117  classifier::MatrixLookup data2(raw_data2);
118  classifier::CrossValidationSampler cv2(target,3,3);
119  classifier::SubsetGenerator<classifier::MatrixLookup> cv_test(cv2,data2);
120
121  std::vector<size_t> sample_count(10,0);
122  std::vector<size_t> test_sample_count(9,0);
123  std::vector<size_t> test_class_count(3,0);
124  std::vector<double> test_value1(4,0);
125  std::vector<double> test_value2(4,0);
126  std::vector<double> t_value(4,0);
127  std::vector<double> v_value(4,0); 
128  for(unsigned long k=0;k<cv_test.size();k++) {
129   
130    const classifier::MatrixLookup& tv_view=cv_test.training_data(k);
131    const classifier::Target& tv_target=cv_test.training_target(k);
132    const utility::Index& tv_index=cv_test.training_index(k);
133    const classifier::MatrixLookup& test_view=cv_test.validation_data(k);
134    const classifier::Target& test_target=cv_test.validation_target(k);
135    const utility::Index& test_index=cv_test.validation_index(k);
136
137    for (size_t i=0; i<test_index.size(); i++) {
138      assert(test_index[i]<sample_count.size());
139      test_sample_count[test_index[i]]++;
140      test_class_count[target(test_index[i])]++;
141      test_value1[0]+=test_view(0,i);
142      test_value2[0]+=test_view(1,i);
143      test_value1[test_target(i)+1]+=test_view(0,i);
144      test_value2[test_target(i)+1]+=test_view(1,i);
145      if(test_target(i)!=target(test_index[i])) {
146        ok=false;
147        suite.err() << "ERROR: incorrect mapping of test indices" << std:: endl;
148      }       
149    }
150   
151    classifier::CrossValidationSampler sampler_training(tv_target,2,2);
152    classifier::SubsetGenerator<classifier::MatrixLookup> 
153      cv_training(sampler_training,tv_view);
154    std::vector<size_t> v_sample_count(6,0);
155    std::vector<size_t> t_sample_count(6,0);
156    std::vector<size_t> v_class_count(3,0);
157    std::vector<size_t> t_class_count(3,0);
158    std::vector<size_t> t_class_count2(3,0);
159    for(unsigned long l=0;l<cv_training.size();l++) {
160      const classifier::MatrixLookup& t_view=cv_training.training_data(l);
161      const classifier::Target& t_target=cv_training.training_target(l);
162      const utility::Index& t_index=cv_training.training_index(l);
163      const classifier::MatrixLookup& v_view=cv_training.validation_data(l);
164      const classifier::Target& v_target=cv_training.validation_target(l);
165      const utility::Index& v_index=cv_training.validation_index(l);
166     
167      if (test_index.size()+tv_index.size()!=target.size() 
168          || t_index.size()+v_index.size() != tv_target.size() 
169          || test_index.size()+v_index.size()+t_index.size() !=  target.size()){
170        ok = false;
171        suite.err() << "ERROR: size of training samples, validation samples " 
172               << "and test samples in is invalid." 
173               << std::endl;
174      }
175      if (test_index.size()!=3 || tv_index.size()!=6 || t_index.size()!=3 ||
176          v_index.size()!=3){
177        ok = false;
178        suite.err() << "ERROR: size of training, validation, and test samples"
179               << " is invalid." 
180               << " Expected sizes to be 3" << std::endl;
181      }     
182
183      std::vector<size_t> tv_sample_count(6,0);
184      for (size_t i=0; i<t_index.size(); i++) {
185        assert(t_index[i]<t_sample_count.size());
186        tv_sample_count[t_index[i]]++;
187        t_sample_count[t_index[i]]++;
188        t_class_count[t_target(i)]++;
189        t_class_count2[tv_target(t_index[i])]++;
190        t_value[0]+=t_view(0,i);
191        t_value[t_target(i)+1]+=t_view(0,i);       
192      }
193      for (size_t i=0; i<v_index.size(); i++) {
194        assert(v_index[i]<v_sample_count.size());
195        tv_sample_count[v_index[i]]++;
196        v_sample_count[v_index[i]]++;
197        v_class_count[v_target(i)]++;
198        v_value[0]+=v_view(0,i);
199        v_value[v_target(i)+1]+=v_view(0,i);
200      }
201 
202      ok = ok && sample_count_test(tv_sample_count,suite);     
203
204    }
205    ok = ok && sample_count_test(v_sample_count,suite);
206    ok = ok && sample_count_test(t_sample_count,suite);
207   
208    ok = ok && class_count_test(t_class_count,suite);
209    ok = ok && class_count_test(t_class_count2,suite);
210    ok = ok && class_count_test(v_class_count,suite);
211
212
213  }
214  ok = ok && sample_count_test(test_sample_count,suite);
215  ok = ok && class_count_test(test_class_count,suite);
216 
217  if(test_value1[0]!=135 || test_value1[1]!=36 || test_value1[2]!=45 ||
218     test_value1[3]!=54) {
219    ok=false;
220    suite.err() << "ERROR: incorrect sums of test values in row 1" 
221           << " found: " << test_value1[0] << ", "  << test_value1[1] 
222           << ", "  << test_value1[2] << " and "  << test_value1[3] 
223           << std::endl;
224  }
225
226 
227  if(test_value2[0]!=225 || test_value2[1]!=66 || test_value2[2]!=75 ||
228     test_value2[3]!=84) {
229    ok=false;
230    suite.err() << "ERROR: incorrect sums of test values in row 2" 
231           << " found: " << test_value2[0] << ", "  << test_value2[1] 
232           << ", "  << test_value2[2] << " and "  << test_value2[3] 
233           << std::endl;
234  }
235
236  if(t_value[0]!=270 || t_value[1]!=72 || t_value[2]!=90 || t_value[3]!=108)  {
237    ok=false;
238    suite.err() << "ERROR: incorrect sums of training values in row 1" 
239           << " found: " << t_value[0] << ", "  << t_value[1] 
240           << ", "  << t_value[2] << " and "  << t_value[3] 
241           << std::endl;   
242  }
243
244  if(v_value[0]!=270 || v_value[1]!=72 || v_value[2]!=90 || v_value[3]!=108)  {
245    ok=false;
246    suite.err() << "ERROR: incorrect sums of validation values in row 1" 
247           << " found: " << v_value[0] << ", "  << v_value[1] 
248           << ", "  << v_value[2] << " and "  << v_value[3] 
249           << std::endl;   
250  }
251  return ok;
252}
253
254bool class_count_test(const std::vector<size_t>& class_count, 
255                      test::Suite& suite) 
256{
257  bool ok=true;
258  for (size_t i=0; i<class_count.size(); i++)
259    if (class_count[i]==0){
260      ok = false;
261      suite.err() << "ERROR: class " << i << " was not in set." 
262             << " Expected at least one sample from each class." 
263             << std::endl;
264    }
265  return ok;
266}
267
268bool sample_count_test(const std::vector<size_t>& sample_count, 
269                       test::Suite& suite) 
270{
271  bool ok=true;
272  for (size_t i=0; i<sample_count.size(); i++){
273    if (sample_count[i]!=1){
274      ok = false;
275      suite.err() << "ERROR: sample " << i << " was in a group " << sample_count[i] 
276             << " times." << " Expected to be 1 time" << std::endl;
277    }
278  }
279  return ok;
280}
281
282
283bool test_bootstrap(test::Suite& suite)
284{
285  bool ok=true;
286  std::vector<std::string> label(10,"default");
287  label[2]=label[7]="white";
288  label[4]=label[5]="black";
289  label[6]=label[3]="green";
290  label[8]=label[9]="red";
291                 
292  classifier::Target target(label);
293  utility::Matrix raw_data(10,10);
294  classifier::MatrixLookup data(raw_data);
295  classifier::BootstrapSampler cv(target,3);
296  return ok;
297}
298
299
300bool test_cv(test::Suite& suite)
301{
302  bool ok=true;
303  std::vector<std::string> label(10,"default");
304  label[2]=label[7]="white";
305  label[4]=label[5]="black";
306  label[6]=label[3]="green";
307  label[8]=label[9]="red";
308                 
309  classifier::Target target(label);
310  utility::Matrix raw_data(10,10);
311  classifier::MatrixLookup data(raw_data);
312  classifier::CrossValidationSampler cv(target,3,3);
313 
314  std::vector<size_t> sample_count(10,0);
315  for (size_t j=0; j<cv.size(); ++j){
316    std::vector<size_t> class_count(5,0);
317    assert(j<cv.size());
318    if (cv.training_index(j).size()+cv.validation_index(j).size()!=
319        target.size()){
320      ok = false;
321      suite.err() << "ERROR: size of training samples plus " 
322             << "size of validation samples is invalid." << std::endl;
323    }
324    if (cv.validation_index(j).size()!=3 && cv.validation_index(j).size()!=4){
325      ok = false;
326      suite.err() << "ERROR: size of validation samples is invalid." 
327             << "expected size to be 3 or 4" << std::endl;
328    }
329    for (size_t i=0; i<cv.validation_index(j).size(); i++) {
330      assert(cv.validation_index(j)[i]<sample_count.size());
331      sample_count[cv.validation_index(j)[i]]++;
332    }
333    for (size_t i=0; i<cv.training_index(j).size(); i++) {
334      class_count[target(cv.training_index(j)[i])]++;
335    }
336    ok = ok && class_count_test(class_count,suite);
337  }
338  ok = ok && sample_count_test(sample_count,suite);
339 
340  return ok;
341}
Note: See TracBrowser for help on using the repository browser.