source: trunk/yat/classifier/SubsetGenerator.h @ 1580

Last change on this file since 1580 was 1487, checked in by Jari Häkkinen, 13 years ago

Addresses #436. GPL license copy reference should also be updated.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 13.7 KB
Line 
1#ifndef _theplu_yat_classifier_subset_generator_
2#define _theplu_yat_classifier_subset_generator_
3
4// $Id: SubsetGenerator.h 1487 2008-09-10 08:41:36Z jari $
5
6/*
7  Copyright (C) 2006 Jari Häkkinen, Peter Johansson, Markus Ringnér
8  Copyright (C) 2007, 2008 Jari Häkkinen, Peter Johansson
9
10  This file is part of the yat library, http://dev.thep.lu.se/yat
11
12  The yat library is free software; you can redistribute it and/or
13  modify it under the terms of the GNU General Public License as
14  published by the Free Software Foundation; either version 3 of the
15  License, or (at your option) any later version.
16
17  The yat library is distributed in the hope that it will be useful,
18  but WITHOUT ANY WARRANTY; without even the implied warranty of
19  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20  General Public License for more details.
21
22  You should have received a copy of the GNU General Public License
23  along with yat. If not, see <http://www.gnu.org/licenses/>.
24*/
25
26#include "FeatureSelector.h"
27#include "KernelLookup.h"
28#include "MatrixLookup.h"
29#include "MatrixLookupWeighted.h"
30#include "Target.h"
31#include "Sampler.h"
32#include "yat/utility/Index.h"
33#include "yat/utility/yat_assert.h"
34
35#include <algorithm>
36#include <cassert>
37#include <utility>
38#include <typeinfo>
39#include <vector>
40
41namespace theplu {
42namespace yat {
43namespace classifier { 
44  ///
45  /// @brief Class splitting Data into training and validation set.
46  ///
47  /// A SubsetGenerator splits a Data into several training and
48  /// validation data. A Sampler is used to select samples for a
49  /// training Data set and a validation Data set, respectively. In
50  /// addition a FeatureSelector can be used to select Features. For
51  /// more details see constructors.
52  ///
53  /// \note Data must be one of MatrixLookup, MatrixLookupWeighted, or
54  /// KernelLookup.
55  ///
56  template <typename Data> 
57  class SubsetGenerator
58  {
59  public:
60    /**
61       type of Data that is stored in SubsetGenerator
62     */
63    typedef Data value_type;
64
65    ///
66    /// @brief Create SubDataSets
67    /// 
68    /// Creates N training data sets and N validation data sets, where
69    /// N equals the size of \a sampler. Data must be one of
70    /// MatrixLookup, MatrixLookupWeighted, or KernelLookup.
71    ///
72    /// In case of MatrixLookup or MatrixLookupWeighted, each column
73    /// corresponds to a sample and the \a sampler is used to select
74    /// columns. Sampler::training_index(size_t) is used to select
75    /// columns for the corresponding traing_data, and
76    /// Sampler::validation_index(size_t) is used to select columns
77    /// for the corresponding validation_data.
78    ///
79    /// In case of a KernelLookup it is a bit different. A symmetric
80    /// training kernel is created using
81    /// Sampler::training_index(size_t) to select rows and
82    /// columns. The validation kernel is typically not symmetric, but
83    /// the columns correspond to a validation sample and each row
84    /// corresponds to a training sample. Consequently
85    /// Sampler::training_index(size_t) is used to select rows, and
86    /// Sampler::validation_index(size_t) is used to select columns.
87    ///
88    /// @param sampler Sampler that is used to select samples.
89    /// @param data Data to split up in validation and training.
90    ///
91    SubsetGenerator(const Sampler& sampler, const Data& data);
92
93    ///
94    /// @brief Create SubDataSets with feature selection
95    /// 
96    /// Creates N training data sets and N validation data sets, where
97    /// N equals the size of \a sampler. The Sampler defines which
98    /// samples are included in a subset. Likewise a FeatureSelector,
99    /// \a fs, is used to select features. The selection is based on
100    /// not based on the entire dataset but solely on the training
101    /// dataset. Data must be one of MatrixLookup,
102    /// MatrixLookupWeighted, or KernelLookup.
103    ///
104    /// In case of MatrixLookup or MatrixLookupWeighted, each column
105    /// corresponds to a sample and the \a sampler is used to select
106    /// columns. Sampler::training_index(size_t) is used to select
107    /// columns for the corresponding traing_data, and
108    /// Sampler::validation_index(size_t) is used to select columns
109    /// for the corresponding validation_data. The FeatureSelector is
110    /// used to select features, i.e., to select rows to be included
111    /// in the subsets.
112    ///
113    /// In case of a KernelLookup it is a bit different. A symmetric
114    /// training kernel is created using
115    /// Sampler::training_index(size_t) to select rows and
116    /// columns. However, the created KernelLookup is not simply the
117    /// subkernel of \a data, but each element is recalculated using
118    /// the features selected by FeatureSelector \a fs. In the
119    /// validation kernel each column corresponds to a validation
120    /// sample and each row corresponds to a training
121    /// sample. Consequently Sampler::training_index(size_t) is used
122    /// to select rows, and Sampler::validation_index(size_t) is used
123    /// to select columns. The same set of features are used to
124    /// caclulate the elements as for the training kernel, i.e.,
125    /// feature selection is based on training data.
126    ///
127    /// @param sampler taking care of partioning dataset
128    /// @param data data to be split up in validation and training.
129    /// @param fs Object selecting features for each subset
130    ///
131    SubsetGenerator(const Sampler& sampler, const Data& data, 
132                    FeatureSelector& fs);
133
134    ///
135    /// Destructor
136    ///
137    ~SubsetGenerator();
138 
139    ///
140    /// @return number of subsets
141    ///
142    size_t size(void) const;
143
144    ///
145    /// @return the target for the total set
146    ///
147    const Target& target(void) const;
148
149    ///
150    /// See constructors for details on how training data are
151    /// generated.
152    ///
153    /// @return ith training data
154    ///
155    const Data& training_data(size_t i) const;
156
157    ///
158    /// Features that are used to create ith training data and
159    /// validation data.
160    ///
161    /// @return training features
162    ///
163    const utility::Index& training_features(size_t i) const;
164
165    ///
166    /// @return Index of samples included in ith training data.
167    ///
168    const utility::Index& training_index(size_t i) const;
169
170    ///
171    /// @return Targets of ith set of training samples
172    ///
173    const Target& training_target(size_t i) const;
174
175    ///
176    /// See constructors for details on how validation data are
177    /// generated.
178    ///
179    /// @return ith validation data
180    ///
181    const Data& validation_data(size_t i) const;
182
183    ///
184    /// @return Index of samples included in ith validation data.
185    ///
186    const utility::Index& validation_index(size_t i) const;
187
188    ///
189    /// @return Targets of ith set validation samples
190    ///
191    const Target& validation_target(size_t i) const;
192
193  private:
194    void build(const MatrixLookup&);
195    void build(const MatrixLookupWeighted&);
196    void build(const KernelLookup&);
197
198    SubsetGenerator(const SubsetGenerator&);
199    const SubsetGenerator& operator=(const SubsetGenerator&) const;
200
201    FeatureSelector* f_selector_;
202    std::vector<utility::Index > features_;
203    const Sampler& sampler_;
204    std::vector<const Data*> training_data_;
205    std::vector<Target> training_target_;
206    std::vector<const Data*> validation_data_;
207    std::vector<Target> validation_target_;
208
209  };
210
211
212  // templates
213
214  template<typename Data>
215  SubsetGenerator<Data>::SubsetGenerator(const Sampler& sampler, 
216                                         const Data& data)
217    : f_selector_(NULL), sampler_(sampler)
218  { 
219    utility::yat_assert<std::runtime_error>(target().size()==data.columns());
220
221    training_data_.reserve(sampler_.size());
222    validation_data_.reserve(sampler_.size());
223    build(data);
224    utility::yat_assert<std::runtime_error>(training_data_.size()==size());
225    utility::yat_assert<std::runtime_error>(training_target_.size()==size());
226    utility::yat_assert<std::runtime_error>(validation_data_.size()==size());
227    utility::yat_assert<std::runtime_error>(validation_target_.size()==size());
228  }
229
230
231  template<typename Data>
232  SubsetGenerator<Data>::SubsetGenerator(const Sampler& sampler, 
233                                      const Data& data, 
234                                      FeatureSelector& fs)
235    : f_selector_(&fs), sampler_(sampler)
236  { 
237    utility::yat_assert<std::runtime_error>(target().size()==data.columns());
238    features_.reserve(size());
239    training_data_.reserve(size());
240    validation_data_.reserve(size());
241    build(data);
242    utility::yat_assert<std::runtime_error>(training_data_.size()==size());
243    utility::yat_assert<std::runtime_error>(training_target_.size()==size());
244    utility::yat_assert<std::runtime_error>(validation_data_.size()==size());
245    utility::yat_assert<std::runtime_error>(validation_target_.size()==size());
246  }
247
248
249  template<typename Data>
250  SubsetGenerator<Data>::~SubsetGenerator()
251  {
252    utility::yat_assert<std::runtime_error>(training_data_.size()==validation_data_.size());
253    for (size_t i=0; i<training_data_.size(); i++) 
254      delete training_data_[i];
255    for (size_t i=0; i<validation_data_.size(); i++) 
256      delete validation_data_[i];
257  }
258
259
260  template<typename Data>
261  void SubsetGenerator<Data>::build(const MatrixLookup& ml)
262  {
263    if (!f_selector_)// no feature selection
264      features_.push_back(utility::Index(ml.rows()));
265
266    for (size_t k=0; k<size(); k++){
267      training_target_.push_back(Target(target(),training_index(k)));
268      validation_target_.push_back(Target(target(),validation_index(k)));
269      if (f_selector_){
270        // training data with no feature selection
271        const MatrixLookup* train_data_all_feat = 
272          new MatrixLookup(ml, training_index(k), false);
273        // use these data to create feature selection
274        utility::yat_assert<std::runtime_error>(train_data_all_feat);
275        f_selector_->update(*train_data_all_feat, training_target(k));
276        // get features
277        features_.push_back(f_selector_->features());
278        utility::yat_assert<std::runtime_error>(train_data_all_feat);
279        delete train_data_all_feat;
280      }
281     
282      // Dynamically allocated. Must be deleted in destructor.
283      training_data_.push_back(new MatrixLookup(ml,features_.back(), 
284                                                training_index(k)));
285      validation_data_.push_back(new MatrixLookup(ml,features_.back(), 
286                                                  validation_index(k)));     
287    }
288
289  }
290
291
292  template<typename Data>
293  void SubsetGenerator<Data>::build(const MatrixLookupWeighted& ml)
294  {
295    if (!f_selector_)// no feature selection
296      features_.push_back(utility::Index(ml.rows()));
297
298    for (unsigned long k=0; k<size(); k++){
299      training_target_.push_back(Target(target(),training_index(k)));
300      validation_target_.push_back(Target(target(),validation_index(k)));
301      if (f_selector_){
302        // training data with no feature selection
303        const MatrixLookupWeighted* train_data_all_feat = 
304          new MatrixLookupWeighted(ml, training_index(k), false);
305        // use these data to create feature selection
306        f_selector_->update(*train_data_all_feat, training_target(k));
307        // get features
308        features_.push_back(f_selector_->features());
309        delete train_data_all_feat;
310      }
311
312
313      // Dynamically allocated. Must be deleted in destructor.
314      training_data_.push_back(new MatrixLookupWeighted(ml, features_.back(), 
315                                                        training_index(k)));
316      validation_data_.push_back(new MatrixLookupWeighted(ml, features_.back(), 
317                                                          validation_index(k)));
318    }
319  }
320
321  template<typename Data>
322  void SubsetGenerator<Data>::build(const KernelLookup& kernel)
323  {
324    for (unsigned long k=0; k<size(); k++){
325      training_target_.push_back(Target(target(),training_index(k)));
326      validation_target_.push_back(Target(target(),validation_index(k)));
327
328      if (f_selector_){
329        if (kernel.weighted()){
330          MatrixLookupWeighted ml = kernel.data_weighted();
331          f_selector_->update(MatrixLookupWeighted(ml,training_index(k),false), 
332                              training_target(k));
333        }
334        else {
335          MatrixLookup ml=kernel.data();
336          f_selector_->update(MatrixLookup(ml,training_index(k), false), 
337                              training_target(k));
338        } 
339        features_.push_back(f_selector_->features());
340        KernelLookup kl = kernel.selected(features_.back());
341        // Dynamically allocated. Must be deleted in destructor.
342        training_data_.push_back(new KernelLookup(kl,training_index(k),
343                                                  training_index(k)));
344        validation_data_.push_back(new KernelLookup(kl, training_index(k), 
345                                                    validation_index(k)));
346      }
347      else {// no feature selection
348        training_data_.push_back(new KernelLookup(kernel, training_index(k),
349                                                  training_index(k)));
350        validation_data_.push_back(new KernelLookup(kernel, 
351                                                    training_index(k), 
352                                                    validation_index(k)));
353      }
354     
355    }
356    if (!f_selector_){
357      if (kernel.weighted())
358        features_.push_back(utility::Index(kernel.data_weighted().rows()));
359      else
360        features_.push_back(utility::Index(kernel.data().rows()));
361    }
362  }
363
364
365  template<typename Data>
366  size_t SubsetGenerator<Data>::size(void) const
367  {
368    return sampler_.size();
369  }
370
371
372  template<typename Data>
373  const Target& SubsetGenerator<Data>::target(void) const
374  {
375    return sampler_.target();
376  }
377
378
379  template<typename Data>
380  const Data&
381  SubsetGenerator<Data>::training_data(size_t i) const 
382  {
383    return *(training_data_[i]);
384  }
385
386
387  template<typename Data>
388  const utility::Index&
389  SubsetGenerator<Data>::training_features(size_t i) const
390  {
391    utility::yat_assert<std::runtime_error>(features_.size(),
392                                           "SubsetGenerator::training_features");
393    return f_selector_ ? features_[i] : features_[0];
394  }
395
396
397  template<typename Data>
398  const utility::Index&
399  SubsetGenerator<Data>::training_index(size_t i) const
400  {
401    return sampler_.training_index(i);
402  }
403
404
405  template<typename Data>
406  const Target&
407  SubsetGenerator<Data>::training_target(size_t i) const
408  {
409    return training_target_[i];
410  }
411
412
413  template<typename Data>
414  const Data&
415  SubsetGenerator<Data>::validation_data(size_t i) const
416  {
417    return *(validation_data_[i]);
418  }
419
420
421  template<typename Data>
422  const utility::Index&
423  SubsetGenerator<Data>::validation_index(size_t i) const
424  {
425    return sampler_.validation_index(i);
426  }
427
428
429  template<typename Data>
430  const Target&
431  SubsetGenerator<Data>::validation_target(size_t i) const
432  {
433    return validation_target_[i];
434  }
435
436}}} // of namespace classifier, yat, and theplu
437
438#endif
439
Note: See TracBrowser for help on using the repository browser.