source: trunk/yat/classifier/SubsetGenerator.h @ 1072

Last change on this file since 1072 was 1072, checked in by Peter, 16 years ago

fixes #309

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 11.0 KB
Line 
1#ifndef _theplu_yat_classifier_subset_generator_
2#define _theplu_yat_classifier_subset_generator_
3
4// $Id: SubsetGenerator.h 1072 2008-02-12 00:22:27Z peter $
5
6/*
7  Copyright (C) 2006 Jari Häkkinen, Markus Ringnér, Peter Johansson
8  Copyright (C) 2007 Peter Johansson
9
10  This file is part of the yat library, http://trac.thep.lu.se/yat
11
12  The yat library is free software; you can redistribute it and/or
13  modify it under the terms of the GNU General Public License as
14  published by the Free Software Foundation; either version 2 of the
15  License, or (at your option) any later version.
16
17  The yat library is distributed in the hope that it will be useful,
18  but WITHOUT ANY WARRANTY; without even the implied warranty of
19  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20  General Public License for more details.
21
22  You should have received a copy of the GNU General Public License
23  along with this program; if not, write to the Free Software
24  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
25  02111-1307, USA.
26*/
27
#include "DataLookup2D.h"
#include "FeatureSelector.h"
#include "KernelLookup.h"
#include "MatrixLookup.h"
#include "MatrixLookupWeighted.h"
#include "Target.h"
#include "Sampler.h"


#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <iostream>
#include <typeinfo>
#include <utility>
#include <vector>
42
43namespace theplu {
44namespace yat {
45namespace classifier { 
46
47  ///
48  /// @brief Class splitting a set into training set and validation set.
49  ///
50  template <typename T> 
51  class SubsetGenerator
52  {
53
54  public:
55    ///
56    /// @brief Constructor
57    /// 
58    /// @param sampler sampler
59    /// @param data data to split up in validation and training.
60    ///
61    SubsetGenerator(const Sampler& sampler, const T& data);
62
63
64    ///
65    /// @brief Constructor
66    /// 
67    /// @param sampler taking care of partioning dataset
68    /// @param data data to be split up in validation and training.
69    /// @param fs Object selecting features for each subset
70    ///
71    SubsetGenerator(const Sampler& sampler, const T& data, 
72                    FeatureSelector& fs);
73
74    ///
75    /// Destructor
76    ///
77    ~SubsetGenerator();
78 
79    ///
80    /// @return number of subsets
81    ///
82    u_long size(void) const;
83
84    ///
85    /// @return the target for the total set
86    ///
87    const Target& target(void) const;
88
89    ///
90    /// @return the sampler for the total set
91    ///
92    //    const Sampler& sampler(void) const;
93
94    ///
95    /// @return training data
96    ///
97    const T& training_data(size_t i) const;
98
99    ///
100    /// @return training features
101    ///
102    const std::vector<size_t>&
103    training_features(std::vector<size_t>::size_type i) const;
104
105    ///
106    /// @return training index
107    ///
108    const std::vector<size_t>&
109    training_index(std::vector<size_t>::size_type i) const;
110
111    ///
112    /// @return training target
113    ///
114    const Target& training_target(std::vector<Target>::size_type i) const;
115
116    ///
117    /// @return validation data
118    ///
119    const T& validation_data(size_t i) const;
120
121    ///
122    /// @return validation index
123    ///
124    const std::vector<size_t>&
125    validation_index(std::vector<size_t>::size_type i) const;
126
127    ///
128    /// @return validation target
129    ///
130    const Target& validation_target(std::vector<Target>::size_type i) const;
131
132    ///
133    /// @return true if weighted
134    /// @todo remove this function
135    //bool weighted(void) const;
136
137  private:
138    SubsetGenerator(const SubsetGenerator&);
139    const SubsetGenerator& operator=(const SubsetGenerator&) const;
140
141    FeatureSelector* f_selector_;
142    std::vector<std::vector<size_t> > features_;
143    const Sampler& sampler_;
144    std::vector<const T*> training_data_;
145    std::vector<Target> training_target_;
146    std::vector<const T*> validation_data_;
147    std::vector<Target> validation_target_;
148    const bool weighted_;
149
150  };
151
152
153  // templates
154
155  template<typename T>
156  SubsetGenerator<T>::SubsetGenerator(const Sampler& sampler, 
157                                   const T& data)
158    : f_selector_(NULL), sampler_(sampler), weighted_(false)
159  { 
160    assert(target().size()==data.columns());
161
162    training_data_.reserve(sampler_.size());
163    validation_data_.reserve(sampler_.size());
164    for (size_t i=0; i<sampler_.size(); ++i){
165      // Dynamically allocated. Must be deleted in destructor.
166      training_data_.push_back(data.training_data(sampler.training_index(i)));
167      validation_data_.push_back(data.validation_data(sampler.training_index(i),
168                                                      sampler.validation_index(i)));
169
170      training_target_.push_back(Target(target(),sampler.training_index(i)));
171      validation_target_.push_back(Target(target(),
172                                          sampler.validation_index(i)));
173      assert(training_data_.size()==i+1);
174      assert(training_target_.size()==i+1);
175      assert(validation_data_.size()==i+1);
176      assert(validation_target_.size()==i+1);
177    }
178
179    // No feature selection, hence features same for all partitions
180    // and can be stored in features_[0]
181    features_.resize(1);
182    features_[0].reserve(data.rows());
183    for (size_t i=0; i<data.rows(); ++i)
184      features_[0].push_back(i);
185
186    assert(training_data_.size()==size());
187    assert(training_target_.size()==size());
188    assert(validation_data_.size()==size());
189    assert(validation_target_.size()==size());
190  }
191
192
193  template<typename T>
194  SubsetGenerator<T>::SubsetGenerator(const Sampler& sampler, 
195                                   const T& data, 
196                                   FeatureSelector& fs)
197    : f_selector_(&fs), sampler_(sampler), weighted_(false)
198  { 
199    assert(target().size()==data.columns());
200
201    features_.reserve(size());
202    training_data_.reserve(size());
203    validation_data_.reserve(size());
204
205    // Taking care of three different case.
206    // We start with the case of MatrixLookup
207    const MatrixLookup* ml = dynamic_cast<const MatrixLookup*>(&data);
208    if (ml){
209      for (size_t k=0; k<size(); k++){
210     
211        training_target_.push_back(Target(target(),training_index(k)));
212        validation_target_.push_back(Target(target(),validation_index(k)));
213        // training data with no feature selection
214        const MatrixLookup* train_data_all_feat = 
215          ml->training_data(training_index(k));
216        // use these data to create feature selection
217        assert(train_data_all_feat);
218        f_selector_->update(*train_data_all_feat, training_target(k));
219        // get features
220        features_.push_back(f_selector_->features());
221        assert(train_data_all_feat);
222        delete train_data_all_feat;
223       
224        // Dynamically allocated. Must be deleted in destructor.
225        training_data_.push_back(new MatrixLookup(*ml,features_.back(), 
226                                                  training_index(k)));
227        validation_data_.push_back(new MatrixLookup(*ml,features_.back(), 
228                                                    validation_index(k)));     
229      }
230    }
231    else {
232      // Second the case of MatrixLookupWeighted
233      const MatrixLookupWeighted* ml = 
234        dynamic_cast<const MatrixLookupWeighted*>(&data);
235      if (ml){       
236        for (u_long k=0; k<size(); k++){
237          training_target_.push_back(Target(target(),training_index(k)));
238          validation_target_.push_back(Target(target(),validation_index(k)));
239          // training data with no feature selection
240          const MatrixLookupWeighted* train_data_all_feat = 
241            ml->training_data(training_index(k));
242          // use these data to create feature selection
243          f_selector_->update(*train_data_all_feat, training_target(k));
244          // get features
245          features_.push_back(f_selector_->features());
246          delete train_data_all_feat;
247         
248          // Dynamically allocated. Must be deleted in destructor.
249          training_data_.push_back(new MatrixLookupWeighted(*ml,
250                                                            features_.back(), 
251                                                            training_index(k)
252                                                            ));
253          validation_data_.push_back(new MatrixLookupWeighted(*ml,
254                                                              features_.back(), 
255                                                              validation_index(k)
256                                                              ));     
257        }
258      }
259      else {
260        // Third the case of MatrixLookupWeighted
261        const KernelLookup* kernel = dynamic_cast<const KernelLookup*>(&data);
262        if (kernel){
263          for (u_long k=0; k<size(); k++){
264            training_target_.push_back(Target(target(),training_index(k)));
265            validation_target_.push_back(Target(target(),validation_index(k)));
266            const T* matrix = kernel->data();
267            // dynamically allocated must be deleted
268            const T* training_matrix = 
269              matrix->training_data(training_index(k));
270            if (matrix->weighted()){
271              const MatrixLookupWeighted& ml = 
272                dynamic_cast<const MatrixLookupWeighted&>(*matrix);
273              f_selector_->update(MatrixLookupWeighted(ml,training_index(k),false), 
274                                  training_target(k));
275            }
276            else {
277              const MatrixLookup& ml = 
278                dynamic_cast<const MatrixLookup&>(*matrix);
279              f_selector_->update(MatrixLookup(ml,training_index(k), false), 
280                                  training_target(k));
281            } 
282            std::vector<size_t> dummie=f_selector_->features();
283            features_.push_back(dummie);
284            //features_.push_back(f_selector_->features());
285            assert(kernel);
286            const KernelLookup* kl = kernel->selected(features_.back());
287            assert(training_matrix);
288            delete training_matrix;
289                     
290            // Dynamically allocated. Must be deleted in destructor.
291            training_data_.push_back(kl->training_data(training_index(k)));
292            validation_data_.push_back(kl->validation_data(training_index(k), 
293                                                           validation_index(k)));
294            assert(kl);
295            delete kl;
296          }
297        }
298        else {
299        std::cerr << "Sorry, your type of T (" 
300                  << typeid(data).name() << ")\nis not supported in " 
301                  << "SubsetGenerator with\nFeatureSelection\n";
302        exit(-1);
303        }
304      }
305    }
306    assert(training_data_.size()==size());
307    assert(training_target_.size()==size());
308    assert(validation_data_.size()==size());
309    assert(validation_target_.size()==size());
310  }
311
312
313  template<typename T>
314  SubsetGenerator<T>::~SubsetGenerator()
315  {
316    assert(training_data_.size()==validation_data_.size());
317    for (size_t i=0; i<training_data_.size(); i++) 
318      delete training_data_[i];
319    for (size_t i=0; i<validation_data_.size(); i++) 
320      delete validation_data_[i];
321  }
322
323
324  template<typename T>
325  u_long SubsetGenerator<T>::size(void) const
326  {
327    return sampler_.size();
328  }
329
330
331  template<typename T>
332  const Target& SubsetGenerator<T>::target(void) const
333  {
334    return sampler_.target();
335  }
336
337
338  template<typename T>
339  const T&
340  SubsetGenerator<T>::training_data(size_t i) const 
341  {
342    return *(training_data_[i]);
343  }
344
345
346  template<typename T>
347  const std::vector<size_t>&
348  SubsetGenerator<T>::training_features(typename std::vector<size_t>::size_type i) const
349  {
350    return f_selector_ ? features_[i] : features_[0];
351  }
352
353
354  template<typename T>
355  const std::vector<size_t>&
356  SubsetGenerator<T>::training_index(std::vector<size_t>::size_type i) const
357  {
358    return sampler_.training_index(i);
359  }
360
361
362  template<typename T>
363  const Target&
364  SubsetGenerator<T>::training_target(std::vector<Target>::size_type i) const
365  {
366    return training_target_[i];
367  }
368
369
370  template<typename T>
371  const T&
372  SubsetGenerator<T>::validation_data(size_t i) const
373  {
374    return *(validation_data_[i]);
375  }
376
377
378  template<typename T>
379  const std::vector<size_t>&
380  SubsetGenerator<T>::validation_index(std::vector<size_t>::size_type i) const
381  {
382    return sampler_.validation_index(i);
383  }
384
385
386  template<typename T>
387  const Target&
388  SubsetGenerator<T>::validation_target(std::vector<Target>::size_type i) const
389  {
390    return validation_target_[i];
391  }
392
393}}} // of namespace classifier, yat, and theplu
394
395#endif
396
Note: See TracBrowser for help on using the repository browser.