source: trunk/yat/classifier/SubsetGenerator.h @ 1875

Last change on this file since 1875 was 1875, checked in by Peter, 13 years ago

fixes #504. Also added pp macro YAT_ASSERT that calls yat_assert with an appropriate msg

#ifndef _theplu_yat_classifier_subset_generator_
#define _theplu_yat_classifier_subset_generator_

// $Id: SubsetGenerator.h 1875 2009-03-19 12:35:47Z peter $

/*
  Copyright (C) 2006 Jari Häkkinen, Peter Johansson, Markus Ringnér
  Copyright (C) 2007, 2008 Jari Häkkinen, Peter Johansson
  Copyright (C) 2009 Peter Johansson

  This file is part of the yat library, http://dev.thep.lu.se/yat

  The yat library is free software; you can redistribute it and/or
  modify it under the terms of the GNU General Public License as
  published by the Free Software Foundation; either version 3 of the
  License, or (at your option) any later version.

  The yat library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with yat. If not, see <http://www.gnu.org/licenses/>.
*/

#include "FeatureSelector.h"
#include "KernelLookup.h"
#include "MatrixLookup.h"
#include "MatrixLookupWeighted.h"
#include "Target.h"
#include "Sampler.h"
#include "yat/utility/Index.h"
#include "yat/utility/yat_assert.h"

#include <algorithm>
#include <cassert>
#include <utility>
#include <typeinfo>
#include <vector>

namespace theplu {
namespace yat {
namespace classifier {
  ///
  /// @brief Class splitting Data into training and validation sets.
  ///
  /// A SubsetGenerator splits Data into several training and
  /// validation data sets. A Sampler is used to select samples for a
  /// training Data set and a validation Data set, respectively. In
  /// addition, a FeatureSelector can be used to select features. For
  /// more details see the constructors.
  ///
  /// \note Data must be one of MatrixLookup, MatrixLookupWeighted, or
  /// KernelLookup.
  ///
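  ///
  /// A minimal usage sketch (only the SubsetGenerator calls are taken
  /// from this class; \c sampler, \c data, and the classifier training
  /// are assumed to be set up elsewhere):
  /// \code
  /// // 'sampler' is some concrete Sampler (e.g. a CrossValidationSampler)
  /// // and 'data' is a MatrixLookup with one column per sample
  /// SubsetGenerator<MatrixLookup> subsets(sampler, data);
  /// for (size_t i=0; i<subsets.size(); ++i) {
  ///   const MatrixLookup& train = subsets.training_data(i);
  ///   const Target& train_target = subsets.training_target(i);
  ///   const MatrixLookup& val = subsets.validation_data(i);
  ///   const Target& val_target = subsets.validation_target(i);
  ///   // train a classifier on (train, train_target) and
  ///   // evaluate it on (val, val_target)
  /// }
  /// \endcode
  ///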
  template <typename Data>
  class SubsetGenerator
  {
  public:
    /**
       type of Data that is stored in SubsetGenerator
     */
    typedef Data value_type;

    ///
    /// @brief Create SubDataSets
    ///
    /// Creates N training data sets and N validation data sets, where
    /// N equals the size of \a sampler. Data must be one of
    /// MatrixLookup, MatrixLookupWeighted, or KernelLookup.
    ///
    /// In case of MatrixLookup or MatrixLookupWeighted, each column
    /// corresponds to a sample and the \a sampler is used to select
    /// columns. Sampler::training_index(size_t) is used to select
    /// columns for the corresponding training_data, and
    /// Sampler::validation_index(size_t) is used to select columns
    /// for the corresponding validation_data.
    ///
    /// In case of a KernelLookup it is a bit different. A symmetric
    /// training kernel is created using
    /// Sampler::training_index(size_t) to select rows and
    /// columns. The validation kernel is typically not symmetric; each
    /// column corresponds to a validation sample and each row
    /// corresponds to a training sample. Consequently,
    /// Sampler::training_index(size_t) is used to select rows, and
    /// Sampler::validation_index(size_t) is used to select columns.
    ///
    /// @param sampler Sampler that is used to select samples.
    /// @param data Data to split up in validation and training.
    ///
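    ///
    /// A sketch of the KernelLookup case (the \c kernel object is
    /// assumed to be constructed elsewhere; the shapes noted in the
    /// comments follow from the description above):
    /// \code
    /// // 'kernel' is a KernelLookup with one column per sample
    /// SubsetGenerator<KernelLookup> subsets(sampler, kernel);
    /// // subsets.training_data(i) is a symmetric kernel over the ith
    /// // set of training samples; subsets.validation_data(i) has one
    /// // row per training sample and one column per validation sample
    /// \endcode
    ///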
    SubsetGenerator(const Sampler& sampler, const Data& data);

    ///
    /// @brief Create SubDataSets with feature selection
    ///
    /// Creates N training data sets and N validation data sets, where
    /// N equals the size of \a sampler. The Sampler defines which
    /// samples are included in a subset. Likewise, a FeatureSelector,
    /// \a fs, is used to select features. The selection is not based
    /// on the entire dataset but solely on the training
    /// dataset. Data must be one of MatrixLookup,
    /// MatrixLookupWeighted, or KernelLookup.
    ///
    /// In case of MatrixLookup or MatrixLookupWeighted, each column
    /// corresponds to a sample and the \a sampler is used to select
    /// columns. Sampler::training_index(size_t) is used to select
    /// columns for the corresponding training_data, and
    /// Sampler::validation_index(size_t) is used to select columns
    /// for the corresponding validation_data. The FeatureSelector is
    /// used to select features, i.e., to select rows to be included
    /// in the subsets.
    ///
    /// In case of a KernelLookup it is a bit different. A symmetric
    /// training kernel is created using
    /// Sampler::training_index(size_t) to select rows and
    /// columns. However, the created KernelLookup is not simply the
    /// subkernel of \a data; each element is recalculated using
    /// the features selected by FeatureSelector \a fs. In the
    /// validation kernel each column corresponds to a validation
    /// sample and each row corresponds to a training
    /// sample. Consequently, Sampler::training_index(size_t) is used
    /// to select rows, and Sampler::validation_index(size_t) is used
    /// to select columns. The same set of features is used to
    /// calculate the elements as for the training kernel, i.e.,
    /// feature selection is based on training data.
    ///
    /// @param sampler Sampler taking care of partitioning the dataset.
    /// @param data Data to be split up in validation and training.
    /// @param fs Object selecting features for each subset.
    ///
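    ///
    /// A sketch of cross-validation with feature selection (\c sampler,
    /// \c data, and the concrete FeatureSelector \c fs are assumed to
    /// be created elsewhere):
    /// \code
    /// SubsetGenerator<MatrixLookup> subsets(sampler, data, fs);
    /// for (size_t i=0; i<subsets.size(); ++i) {
    ///   // rows (features) were selected using the ith training data only
    ///   const utility::Index& features = subsets.training_features(i);
    ///   const MatrixLookup& train = subsets.training_data(i);
    ///   const MatrixLookup& val = subsets.validation_data(i);
    /// }
    /// \endcode
    ///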
    SubsetGenerator(const Sampler& sampler, const Data& data,
                    FeatureSelector& fs);

    ///
    /// Destructor
    ///
    ~SubsetGenerator();

    ///
    /// @return number of subsets
    ///
    size_t size(void) const;

    ///
    /// @return the target for the total set
    ///
    const Target& target(void) const;

    ///
    /// See constructors for details on how training data are
    /// generated.
    ///
    /// @return ith training data
    ///
    const Data& training_data(size_t i) const;

    ///
    /// Features that are used to create the ith training data and
    /// validation data.
    ///
    /// @return training features
    ///
    const utility::Index& training_features(size_t i) const;

    ///
    /// @return Index of samples included in ith training data.
    ///
    const utility::Index& training_index(size_t i) const;

    ///
    /// @return Targets of ith set of training samples
    ///
    const Target& training_target(size_t i) const;

    ///
    /// See constructors for details on how validation data are
    /// generated.
    ///
    /// @return ith validation data
    ///
    const Data& validation_data(size_t i) const;

    ///
    /// @return Index of samples included in ith validation data.
    ///
    const utility::Index& validation_index(size_t i) const;

    ///
    /// @return Targets of ith set of validation samples
    ///
    const Target& validation_target(size_t i) const;

  private:
    void build(const MatrixLookup&);
    void build(const MatrixLookupWeighted&);
    void build(const KernelLookup&);

    SubsetGenerator(const SubsetGenerator&);
    const SubsetGenerator& operator=(const SubsetGenerator&) const;

    FeatureSelector* f_selector_;
    std::vector<utility::Index> features_;
    const Sampler& sampler_;
    std::vector<const Data*> training_data_;
    std::vector<Target> training_target_;
    std::vector<const Data*> validation_data_;
    std::vector<Target> validation_target_;

  };


  // templates

  template<typename Data>
  SubsetGenerator<Data>::SubsetGenerator(const Sampler& sampler,
                                         const Data& data)
    : f_selector_(NULL), sampler_(sampler)
  {
    YAT_ASSERT(target().size()==data.columns());

    training_data_.reserve(sampler_.size());
    validation_data_.reserve(sampler_.size());
    build(data);
    YAT_ASSERT(training_data_.size()==size());
    YAT_ASSERT(training_target_.size()==size());
    YAT_ASSERT(validation_data_.size()==size());
    YAT_ASSERT(validation_target_.size()==size());
  }


  template<typename Data>
  SubsetGenerator<Data>::SubsetGenerator(const Sampler& sampler,
                                         const Data& data,
                                         FeatureSelector& fs)
    : f_selector_(&fs), sampler_(sampler)
  {
    YAT_ASSERT(target().size()==data.columns());
    features_.reserve(size());
    training_data_.reserve(size());
    validation_data_.reserve(size());
    build(data);
    YAT_ASSERT(training_data_.size()==size());
    YAT_ASSERT(training_target_.size()==size());
    YAT_ASSERT(validation_data_.size()==size());
    YAT_ASSERT(validation_target_.size()==size());
  }


  template<typename Data>
  SubsetGenerator<Data>::~SubsetGenerator()
  {
    YAT_ASSERT(training_data_.size()==validation_data_.size());
    for (size_t i=0; i<training_data_.size(); i++)
      delete training_data_[i];
    for (size_t i=0; i<validation_data_.size(); i++)
      delete validation_data_[i];
  }


  template<typename Data>
  void SubsetGenerator<Data>::build(const MatrixLookup& ml)
  {
    if (!f_selector_) // no feature selection
      features_.push_back(utility::Index(ml.rows()));

    for (size_t k=0; k<size(); k++){
      training_target_.push_back(Target(target(),training_index(k)));
      validation_target_.push_back(Target(target(),validation_index(k)));
      if (f_selector_){
        // training data with no feature selection
        const MatrixLookup* train_data_all_feat =
          new MatrixLookup(ml, training_index(k), false);
        // use these data to create feature selection
        YAT_ASSERT(train_data_all_feat);
        f_selector_->update(*train_data_all_feat, training_target(k));
        // get features
        features_.push_back(f_selector_->features());
        YAT_ASSERT(train_data_all_feat);
        delete train_data_all_feat;
      }

      // Dynamically allocated. Must be deleted in destructor.
      training_data_.push_back(new MatrixLookup(ml, features_.back(),
                                                training_index(k)));
      validation_data_.push_back(new MatrixLookup(ml, features_.back(),
                                                  validation_index(k)));
    }
  }


  template<typename Data>
  void SubsetGenerator<Data>::build(const MatrixLookupWeighted& ml)
  {
    if (!f_selector_) // no feature selection
      features_.push_back(utility::Index(ml.rows()));

    for (size_t k=0; k<size(); k++){
      training_target_.push_back(Target(target(),training_index(k)));
      validation_target_.push_back(Target(target(),validation_index(k)));
      if (f_selector_){
        // training data with no feature selection
        const MatrixLookupWeighted* train_data_all_feat =
          new MatrixLookupWeighted(ml, training_index(k), false);
        // use these data to create feature selection
        f_selector_->update(*train_data_all_feat, training_target(k));
        // get features
        features_.push_back(f_selector_->features());
        delete train_data_all_feat;
      }

      // Dynamically allocated. Must be deleted in destructor.
      training_data_.push_back(new MatrixLookupWeighted(ml, features_.back(),
                                                        training_index(k)));
      validation_data_.push_back(new MatrixLookupWeighted(ml, features_.back(),
                                                          validation_index(k)));
    }
  }


  template<typename Data>
  void SubsetGenerator<Data>::build(const KernelLookup& kernel)
  {
    for (size_t k=0; k<size(); k++){
      training_target_.push_back(Target(target(),training_index(k)));
      validation_target_.push_back(Target(target(),validation_index(k)));

      if (f_selector_){
        if (kernel.weighted()){
          MatrixLookupWeighted ml = kernel.data_weighted();
          f_selector_->update(MatrixLookupWeighted(ml,training_index(k),false),
                              training_target(k));
        }
        else {
          MatrixLookup ml=kernel.data();
          f_selector_->update(MatrixLookup(ml,training_index(k), false),
                              training_target(k));
        }
        features_.push_back(f_selector_->features());
        KernelLookup kl = kernel.selected(features_.back());
        // Dynamically allocated. Must be deleted in destructor.
        training_data_.push_back(new KernelLookup(kl,training_index(k),
                                                  training_index(k)));
        validation_data_.push_back(new KernelLookup(kl, training_index(k),
                                                    validation_index(k)));
      }
      else { // no feature selection
        training_data_.push_back(new KernelLookup(kernel, training_index(k),
                                                  training_index(k)));
        validation_data_.push_back(new KernelLookup(kernel,
                                                    training_index(k),
                                                    validation_index(k)));
      }

    }
    if (!f_selector_){
      if (kernel.weighted())
        features_.push_back(utility::Index(kernel.data_weighted().rows()));
      else
        features_.push_back(utility::Index(kernel.data().rows()));
    }
  }


  template<typename Data>
  size_t SubsetGenerator<Data>::size(void) const
  {
    return sampler_.size();
  }


  template<typename Data>
  const Target& SubsetGenerator<Data>::target(void) const
  {
    return sampler_.target();
  }


  template<typename Data>
  const Data&
  SubsetGenerator<Data>::training_data(size_t i) const
  {
    return *(training_data_[i]);
  }


  template<typename Data>
  const utility::Index&
  SubsetGenerator<Data>::training_features(size_t i) const
  {
    utility::yat_assert<std::runtime_error>(features_.size(),
                                            "SubsetGenerator::training_features");
    return f_selector_ ? features_[i] : features_[0];
  }


  template<typename Data>
  const utility::Index&
  SubsetGenerator<Data>::training_index(size_t i) const
  {
    return sampler_.training_index(i);
  }


  template<typename Data>
  const Target&
  SubsetGenerator<Data>::training_target(size_t i) const
  {
    return training_target_[i];
  }


  template<typename Data>
  const Data&
  SubsetGenerator<Data>::validation_data(size_t i) const
  {
    return *(validation_data_[i]);
  }


  template<typename Data>
  const utility::Index&
  SubsetGenerator<Data>::validation_index(size_t i) const
  {
    return sampler_.validation_index(i);
  }


  template<typename Data>
  const Target&
  SubsetGenerator<Data>::validation_target(size_t i) const
  {
    return validation_target_[i];
  }

}}} // of namespace classifier, yat, and theplu

#endif