source: trunk/yat/classifier/SubsetGenerator.h @ 1221

Last change on this file since 1221 was 1220, checked in by Peter, 13 years ago

refs #341

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 13.8 KB
Line 
1#ifndef _theplu_yat_classifier_subset_generator_
2#define _theplu_yat_classifier_subset_generator_
3
4// $Id: SubsetGenerator.h 1220 2008-03-11 00:07:42Z peter $
5
6/*
7  Copyright (C) 2006 Jari Häkkinen, Markus Ringnér, Peter Johansson
8  Copyright (C) 2007, 2008 Peter Johansson
9
10  This file is part of the yat library, http://trac.thep.lu.se/yat
11
12  The yat library is free software; you can redistribute it and/or
13  modify it under the terms of the GNU General Public License as
14  published by the Free Software Foundation; either version 2 of the
15  License, or (at your option) any later version.
16
17  The yat library is distributed in the hope that it will be useful,
18  but WITHOUT ANY WARRANTY; without even the implied warranty of
19  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20  General Public License for more details.
21
22  You should have received a copy of the GNU General Public License
23  along with this program; if not, write to the Free Software
24  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
25  02111-1307, USA.
26*/
27
28#include "FeatureSelector.h"
29#include "KernelLookup.h"
30#include "MatrixLookup.h"
31#include "MatrixLookupWeighted.h"
32#include "Target.h"
33#include "Sampler.h"
34#include "yat/utility/Index.h"
35#include "yat/utility/yat_assert.h"
36
37#include <algorithm>
38#include <cassert>
39#include <utility>
40#include <typeinfo>
41#include <vector>
42
43namespace theplu {
44namespace yat {
45namespace classifier { 
46  ///
47  /// @brief Class splitting Data into training and validation set.
48  ///
49  /// A SubsetGenerator splits a Data into several training and
50  /// validation data. A Sampler is used to select samples for a
51  /// training Data set and a validation Data set, respectively. In
52  /// addition a FeatureSelector can be used to select Features. For
53  /// more details see constructors.
54  ///
55  /// \note Data must be one of MatrixLookup, MatrixLookupWeighted, or
56  /// KernelLookup.
57  ///
58  template <typename Data> 
59  class SubsetGenerator
60  {
61  public:
62    /**
63       type of Data that is stored in SubsetGenerator
64     */
65    typedef Data value_type;
66
67    ///
68    /// @brief Create SubDataSets
69    /// 
70    /// Creates N training data sets and N validation data sets, where
71    /// N equals the size of \a sampler. Data must be one of
72    /// MatrixLookup, MatrixLookupWeighted, or KernelLookup.
73    ///
74    /// In case of MatrixLookup or MatrixLookupWeighted, each column
75    /// corresponds to a sample and the \a sampler is used to select
76    /// columns. Sampler::training_index(size_t) is used to select
77    /// columns for the corresponding traing_data, and
78    /// Sampler::validation_index(size_t) is used to select columns
79    /// for the corresponding validation_data.
80    ///
81    /// In case of a KernelLookup it is a bit different. A symmetric
82    /// training kernel is created using
83    /// Sampler::training_index(size_t) to select rows and
84    /// columns. The validation kernel is typically not symmetric, but
85    /// the columns correspond to a validation sample and each row
86    /// corresponds to a training sample. Consequently
87    /// Sampler::training_index(size_t) is used to select rows, and
88    /// Sampler::validation_index(size_t) is used to select columns.
89    ///
90    /// @param sampler Sampler that is used to select samples.
91    /// @param data Data to split up in validation and training.
92    ///
93    SubsetGenerator(const Sampler& sampler, const Data& data);
94
95    ///
96    /// @brief Create SubDataSets with feature selection
97    /// 
98    /// Creates N training data sets and N validation data sets, where
99    /// N equals the size of \a sampler. The Sampler defines which
100    /// samples are included in a subset. Likewise a FeatureSelector,
101    /// \a fs, is used to select features. The selection is based on
102    /// not based on the entire dataset but solely on the training
103    /// dataset. Data must be one of MatrixLookup,
104    /// MatrixLookupWeighted, or KernelLookup.
105    ///
106    /// In case of MatrixLookup or MatrixLookupWeighted, each column
107    /// corresponds to a sample and the \a sampler is used to select
108    /// columns. Sampler::training_index(size_t) is used to select
109    /// columns for the corresponding traing_data, and
110    /// Sampler::validation_index(size_t) is used to select columns
111    /// for the corresponding validation_data. The FeatureSelector is
112    /// used to select features, i.e., to select rows to be included
113    /// in the subsets.
114    ///
115    /// In case of a KernelLookup it is a bit different. A symmetric
116    /// training kernel is created using
117    /// Sampler::training_index(size_t) to select rows and
118    /// columns. However, the created KernelLookup is not simply the
119    /// subkernel of \a data, but each element is recalculated using
120    /// the features selected by FeatureSelector \a fs. In the
121    /// validation kernel each column corresponds to a validation
122    /// sample and each row corresponds to a training
123    /// sample. Consequently Sampler::training_index(size_t) is used
124    /// to select rows, and Sampler::validation_index(size_t) is used
125    /// to select columns. The same set of features are used to
126    /// caclulate the elements as for the training kernel, i.e.,
127    /// feature selection is based on training data.
128    ///
129    /// @param sampler taking care of partioning dataset
130    /// @param data data to be split up in validation and training.
131    /// @param fs Object selecting features for each subset
132    ///
133    SubsetGenerator(const Sampler& sampler, const Data& data, 
134                    FeatureSelector& fs);
135
136    ///
137    /// Destructor
138    ///
139    ~SubsetGenerator();
140 
141    ///
142    /// @return number of subsets
143    ///
144    size_t size(void) const;
145
146    ///
147    /// @return the target for the total set
148    ///
149    const Target& target(void) const;
150
151    ///
152    /// See constructors for details on how training data are
153    /// generated.
154    ///
155    /// @return ith training data
156    ///
157    const Data& training_data(size_t i) const;
158
159    ///
160    /// Features that are used to create ith training data and
161    /// validation data.
162    ///
163    /// @return training features
164    ///
165    const utility::Index& training_features(size_t i) const;
166
167    ///
168    /// @return Index of samples included in ith training samples.
169    ///
170    const utility::Index& training_index(size_t i) const;
171
172    ///
173    /// @return Targets of ith set of training samples
174    ///
175    const Target& training_target(size_t i) const;
176
177    ///
178    /// See constructors for details on how validation data are
179    /// generated.
180    ///
181    /// @return ith validation data
182    ///
183    const Data& validation_data(size_t i) const;
184
185    ///
186    /// @return Index of samples included in ith validation samples.
187    ///
188    const utility::Index& validation_index(size_t i) const;
189
190    ///
191    /// @return Targets of ith set validation samples
192    ///
193    const Target& validation_target(size_t i) const;
194
195  private:
196    void build(const MatrixLookup&);
197    void build(const MatrixLookupWeighted&);
198    void build(const KernelLookup&);
199
200    SubsetGenerator(const SubsetGenerator&);
201    const SubsetGenerator& operator=(const SubsetGenerator&) const;
202
203    FeatureSelector* f_selector_;
204    std::vector<utility::Index > features_;
205    const Sampler& sampler_;
206    std::vector<const Data*> training_data_;
207    std::vector<Target> training_target_;
208    std::vector<const Data*> validation_data_;
209    std::vector<Target> validation_target_;
210
211  };
212
213
214  // templates
215
216  template<typename Data>
217  SubsetGenerator<Data>::SubsetGenerator(const Sampler& sampler, 
218                                         const Data& data)
219    : f_selector_(NULL), sampler_(sampler)
220  { 
221    utility::yat_assert<std::runtime_error>(target().size()==data.columns());
222
223    training_data_.reserve(sampler_.size());
224    validation_data_.reserve(sampler_.size());
225    build(data);
226    utility::yat_assert<std::runtime_error>(training_data_.size()==size());
227    utility::yat_assert<std::runtime_error>(training_target_.size()==size());
228    utility::yat_assert<std::runtime_error>(validation_data_.size()==size());
229    utility::yat_assert<std::runtime_error>(validation_target_.size()==size());
230  }
231
232
233  template<typename Data>
234  SubsetGenerator<Data>::SubsetGenerator(const Sampler& sampler, 
235                                      const Data& data, 
236                                      FeatureSelector& fs)
237    : f_selector_(&fs), sampler_(sampler)
238  { 
239    utility::yat_assert<std::runtime_error>(target().size()==data.columns());
240    features_.reserve(size());
241    training_data_.reserve(size());
242    validation_data_.reserve(size());
243    build(data);
244    utility::yat_assert<std::runtime_error>(training_data_.size()==size());
245    utility::yat_assert<std::runtime_error>(training_target_.size()==size());
246    utility::yat_assert<std::runtime_error>(validation_data_.size()==size());
247    utility::yat_assert<std::runtime_error>(validation_target_.size()==size());
248  }
249
250
251  template<typename Data>
252  SubsetGenerator<Data>::~SubsetGenerator()
253  {
254    utility::yat_assert<std::runtime_error>(training_data_.size()==validation_data_.size());
255    for (size_t i=0; i<training_data_.size(); i++) 
256      delete training_data_[i];
257    for (size_t i=0; i<validation_data_.size(); i++) 
258      delete validation_data_[i];
259  }
260
261
262  template<typename Data>
263  void SubsetGenerator<Data>::build(const MatrixLookup& ml)
264  {
265    if (!f_selector_)// no feature selection
266      features_.push_back(utility::Index(ml.rows()));
267
268    for (size_t k=0; k<size(); k++){
269      training_target_.push_back(Target(target(),training_index(k)));
270      validation_target_.push_back(Target(target(),validation_index(k)));
271      if (f_selector_){
272        // training data with no feature selection
273        const MatrixLookup* train_data_all_feat = 
274          new MatrixLookup(ml, training_index(k), false);
275        // use these data to create feature selection
276        utility::yat_assert<std::runtime_error>(train_data_all_feat);
277        f_selector_->update(*train_data_all_feat, training_target(k));
278        // get features
279        features_.push_back(f_selector_->features());
280        utility::yat_assert<std::runtime_error>(train_data_all_feat);
281        delete train_data_all_feat;
282      }
283     
284      // Dynamically allocated. Must be deleted in destructor.
285      training_data_.push_back(new MatrixLookup(ml,features_.back(), 
286                                                training_index(k)));
287      validation_data_.push_back(new MatrixLookup(ml,features_.back(), 
288                                                  validation_index(k)));     
289    }
290
291  }
292
293
294  template<typename Data>
295  void SubsetGenerator<Data>::build(const MatrixLookupWeighted& ml)
296  {
297    if (!f_selector_)// no feature selection
298      features_.push_back(utility::Index(ml.rows()));
299
300    for (u_long k=0; k<size(); k++){
301      training_target_.push_back(Target(target(),training_index(k)));
302      validation_target_.push_back(Target(target(),validation_index(k)));
303      if (f_selector_){
304        // training data with no feature selection
305        const MatrixLookupWeighted* train_data_all_feat = 
306          new MatrixLookupWeighted(ml, training_index(k), false);
307        // use these data to create feature selection
308        f_selector_->update(*train_data_all_feat, training_target(k));
309        // get features
310        features_.push_back(f_selector_->features());
311        delete train_data_all_feat;
312      }
313
314
315      // Dynamically allocated. Must be deleted in destructor.
316      training_data_.push_back(new MatrixLookupWeighted(ml, features_.back(), 
317                                                        training_index(k)));
318      validation_data_.push_back(new MatrixLookupWeighted(ml, features_.back(), 
319                                                          validation_index(k)));
320    }
321  }
322
323  template<typename Data>
324  void SubsetGenerator<Data>::build(const KernelLookup& kernel)
325  {
326    for (u_long k=0; k<size(); k++){
327      training_target_.push_back(Target(target(),training_index(k)));
328      validation_target_.push_back(Target(target(),validation_index(k)));
329
330      if (f_selector_){
331        if (kernel.weighted()){
332          MatrixLookupWeighted ml = kernel.data_weighted();
333          f_selector_->update(MatrixLookupWeighted(ml,training_index(k),false), 
334                              training_target(k));
335        }
336        else {
337          MatrixLookup ml=kernel.data();
338          f_selector_->update(MatrixLookup(ml,training_index(k), false), 
339                              training_target(k));
340        } 
341        features_.push_back(f_selector_->features());
342        KernelLookup kl = kernel.selected(features_.back());
343        // Dynamically allocated. Must be deleted in destructor.
344        training_data_.push_back(new KernelLookup(kl,training_index(k),
345                                                  training_index(k)));
346        validation_data_.push_back(new KernelLookup(kl, training_index(k), 
347                                                    validation_index(k)));
348      }
349      else {// no feature selection
350        training_data_.push_back(new KernelLookup(kernel, training_index(k),
351                                                  training_index(k)));
352        validation_data_.push_back(new KernelLookup(kernel, 
353                                                    training_index(k), 
354                                                    validation_index(k)));
355      }
356     
357    }
358    if (!f_selector_){
359      if (kernel.weighted())
360        features_.push_back(utility::Index(kernel.data_weighted().rows()));
361      else
362        features_.push_back(utility::Index(kernel.data().rows()));
363    }
364  }
365
366
367  template<typename Data>
368  size_t SubsetGenerator<Data>::size(void) const
369  {
370    return sampler_.size();
371  }
372
373
374  template<typename Data>
375  const Target& SubsetGenerator<Data>::target(void) const
376  {
377    return sampler_.target();
378  }
379
380
381  template<typename Data>
382  const Data&
383  SubsetGenerator<Data>::training_data(size_t i) const 
384  {
385    return *(training_data_[i]);
386  }
387
388
389  template<typename Data>
390  const utility::Index&
391  SubsetGenerator<Data>::training_features(size_t i) const
392  {
393    utility::yat_assert<std::runtime_error>(features_.size(),
394                                           "SubsetGenerator::training_features");
395    return f_selector_ ? features_[i] : features_[0];
396  }
397
398
399  template<typename Data>
400  const utility::Index&
401  SubsetGenerator<Data>::training_index(size_t i) const
402  {
403    return sampler_.training_index(i);
404  }
405
406
407  template<typename Data>
408  const Target&
409  SubsetGenerator<Data>::training_target(size_t i) const
410  {
411    return training_target_[i];
412  }
413
414
415  template<typename Data>
416  const Data&
417  SubsetGenerator<Data>::validation_data(size_t i) const
418  {
419    return *(validation_data_[i]);
420  }
421
422
423  template<typename Data>
424  const utility::Index&
425  SubsetGenerator<Data>::validation_index(size_t i) const
426  {
427    return sampler_.validation_index(i);
428  }
429
430
431  template<typename Data>
432  const Target&
433  SubsetGenerator<Data>::validation_target(size_t i) const
434  {
435    return validation_target_[i];
436  }
437
438}}} // of namespace classifier, yat, and theplu
439
440#endif
441
Note: See TracBrowser for help on using the repository browser.