source: trunk/yat/classifier/SubsetGenerator.h

#ifndef _theplu_yat_classifier_subset_generator_
#define _theplu_yat_classifier_subset_generator_

// $Id: SubsetGenerator.h 4359 2023-08-23 01:28:46Z peter $

/*
  Copyright (C) 2006 Jari Häkkinen, Peter Johansson, Markus Ringnér
  Copyright (C) 2007 Peter Johansson
  Copyright (C) 2008 Jari Häkkinen, Peter Johansson
  Copyright (C) 2009, 2010 Peter Johansson

  This file is part of the yat library, http://dev.thep.lu.se/yat

  The yat library is free software; you can redistribute it and/or
  modify it under the terms of the GNU General Public License as
  published by the Free Software Foundation; either version 3 of the
  License, or (at your option) any later version.

  The yat library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with yat. If not, see <http://www.gnu.org/licenses/>.
*/

#include "FeatureSelector.h"
#include "KernelLookup.h"
#include "MatrixLookup.h"
#include "MatrixLookupWeighted.h"
#include "Target.h"
#include "Sampler.h"
#include "yat/utility/Index.h"
#include "yat/utility/yat_assert.h"

#include <algorithm>
#include <utility>
#include <typeinfo>
#include <vector>

namespace theplu {
namespace yat {
namespace classifier {
  ///
  /// @brief Class splitting Data into training and validation sets.
  ///
  /// A SubsetGenerator splits Data into several pairs of training
  /// and validation data sets. A Sampler is used to select the
  /// samples for each training Data set and validation Data set,
  /// respectively. In addition, a FeatureSelector can be used to
  /// select features. For more details see the constructors.
  ///
  /// \note Data must be one of MatrixLookup, MatrixLookupWeighted, or
  /// KernelLookup.
  ///
  template <typename Data>
  class SubsetGenerator
  {
  public:
    /**
       type of Data that is stored in SubsetGenerator
     */
    typedef Data value_type;

    ///
    /// @brief Create SubDataSets
    ///
    /// Creates N training data sets and N validation data sets, where
    /// N equals the size of \a sampler. Data must be one of
    /// MatrixLookup, MatrixLookupWeighted, or KernelLookup.
    ///
    /// In case of MatrixLookup or MatrixLookupWeighted, each column
    /// corresponds to a sample and the \a sampler is used to select
    /// columns. Sampler::training_index(size_t) is used to select
    /// columns for the corresponding training_data, and
    /// Sampler::validation_index(size_t) is used to select columns
    /// for the corresponding validation_data.
    ///
    /// The case of a KernelLookup is slightly different. A symmetric
    /// training kernel is created using
    /// Sampler::training_index(size_t) to select rows and
    /// columns. The validation kernel is typically not symmetric:
    /// each column corresponds to a validation sample and each row
    /// corresponds to a training sample. Consequently
    /// Sampler::training_index(size_t) is used to select rows, and
    /// Sampler::validation_index(size_t) is used to select columns.
    ///
    /// @param sampler Sampler that is used to select samples.
    /// @param data Data to split into validation and training sets.
    ///
    SubsetGenerator(const Sampler& sampler, const Data& data);

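    /*
      A minimal usage sketch, assuming a Sampler `sampler` and a
      MatrixLookup `data` created elsewhere with data.columns()
      equal to sampler.target().size():

        SubsetGenerator<MatrixLookup> subsets(sampler, data);
        for (size_t i=0; i<subsets.size(); ++i) {
          const MatrixLookup& train = subsets.training_data(i);
          const Target& train_target = subsets.training_target(i);
          const MatrixLookup& validation = subsets.validation_data(i);
          const Target& validation_target = subsets.validation_target(i);
          // train a classifier on (train, train_target) and evaluate
          // its predictions on (validation, validation_target)
        }
    */
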
    ///
    /// @brief Create SubDataSets with feature selection
    ///
    /// Creates N training data sets and N validation data sets, where
    /// N equals the size of \a sampler. The Sampler defines which
    /// samples are included in a subset. Likewise a FeatureSelector,
    /// \a fs, is used to select features. The selection is not based
    /// on the entire dataset but solely on the training
    /// dataset. Data must be one of MatrixLookup,
    /// MatrixLookupWeighted, or KernelLookup.
    ///
    /// In case of MatrixLookup or MatrixLookupWeighted, each column
    /// corresponds to a sample and the \a sampler is used to select
    /// columns. Sampler::training_index(size_t) is used to select
    /// columns for the corresponding training_data, and
    /// Sampler::validation_index(size_t) is used to select columns
    /// for the corresponding validation_data. The FeatureSelector is
    /// used to select features, i.e., to select rows to be included
    /// in the subsets.
    ///
    /// The case of a KernelLookup is slightly different. A symmetric
    /// training kernel is created using
    /// Sampler::training_index(size_t) to select rows and
    /// columns. However, the created KernelLookup is not simply the
    /// subkernel of \a data, but each element is recalculated using
    /// the features selected by FeatureSelector \a fs. In the
    /// validation kernel each column corresponds to a validation
    /// sample and each row corresponds to a training
    /// sample. Consequently Sampler::training_index(size_t) is used
    /// to select rows, and Sampler::validation_index(size_t) is used
    /// to select columns. The same set of features is used to
    /// calculate the elements as for the training kernel, i.e.,
    /// feature selection is based on training data.
    ///
    /// @param sampler Sampler taking care of partitioning the dataset.
    /// @param data Data to be split into validation and training sets.
    /// @param fs Object selecting features for each subset.
    ///
    SubsetGenerator(const Sampler& sampler, const Data& data,
                    FeatureSelector& fs);

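    /*
      Sketch of the feature selection variant, assuming the same
      `sampler` and `data` as above and a concrete FeatureSelector
      `fs` created elsewhere:

        SubsetGenerator<MatrixLookup> subsets(sampler, data, fs);
        for (size_t i=0; i<subsets.size(); ++i) {
          // rows of training_data(i) and validation_data(i) are the
          // features selected using the i:th training set only
          const utility::Index& features = subsets.training_features(i);
          const MatrixLookup& train = subsets.training_data(i);
          const MatrixLookup& validation = subsets.validation_data(i);
        }
    */
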
    ///
    /// Destructor
    ///
    ~SubsetGenerator();

    ///
    /// @return number of subsets
    ///
    size_t size(void) const;

    ///
    /// @return the target for the total set
    ///
    const Target& target(void) const;

    ///
    /// See constructors for details on how training data are
    /// generated.
    ///
    /// @return ith training data
    ///
    const Data& training_data(size_t i) const;

    ///
    /// Features that are used to create ith training data and
    /// validation data.
    ///
    /// @return training features
    ///
    const utility::Index& training_features(size_t i) const;

    ///
    /// @return Index of samples included in ith training data.
    ///
    const utility::Index& training_index(size_t i) const;

    ///
    /// @return Targets of ith set of training samples
    ///
    const Target& training_target(size_t i) const;

    ///
    /// See constructors for details on how validation data are
    /// generated.
    ///
    /// @return ith validation data
    ///
    const Data& validation_data(size_t i) const;

    ///
    /// @return Index of samples included in ith validation data.
    ///
    const utility::Index& validation_index(size_t i) const;

    ///
    /// @return Targets of ith set of validation samples
    ///
    const Target& validation_target(size_t i) const;

  private:
    void build(const MatrixLookup&);
    void build(const MatrixLookupWeighted&);
    void build(const KernelLookup&);

    SubsetGenerator(const SubsetGenerator&);
    const SubsetGenerator& operator=(const SubsetGenerator&) const;

    FeatureSelector* f_selector_;
    std::vector<utility::Index > features_;
    const Sampler& sampler_;
    std::vector<const Data*> training_data_;
    std::vector<Target> training_target_;
    std::vector<const Data*> validation_data_;
    std::vector<Target> validation_target_;

  };


  // templates

  template<typename Data>
  SubsetGenerator<Data>::SubsetGenerator(const Sampler& sampler,
                                         const Data& data)
    : f_selector_(NULL), sampler_(sampler)
  {
    YAT_ASSERT(target().size()==data.columns());

    training_data_.reserve(sampler_.size());
    validation_data_.reserve(sampler_.size());
    build(data);
    YAT_ASSERT(training_data_.size()==size());
    YAT_ASSERT(training_target_.size()==size());
    YAT_ASSERT(validation_data_.size()==size());
    YAT_ASSERT(validation_target_.size()==size());
  }


  template<typename Data>
  SubsetGenerator<Data>::SubsetGenerator(const Sampler& sampler,
                                         const Data& data,
                                         FeatureSelector& fs)
    : f_selector_(&fs), sampler_(sampler)
  {
    YAT_ASSERT(target().size()==data.columns());
    features_.reserve(size());
    training_data_.reserve(size());
    validation_data_.reserve(size());
    build(data);
    YAT_ASSERT(training_data_.size()==size());
    YAT_ASSERT(training_target_.size()==size());
    YAT_ASSERT(validation_data_.size()==size());
    YAT_ASSERT(validation_target_.size()==size());
  }


  template<typename Data>
  SubsetGenerator<Data>::~SubsetGenerator()
  {
    YAT_ASSERT(training_data_.size()==validation_data_.size());
    for (size_t i=0; i<training_data_.size(); i++)
      delete training_data_[i];
    for (size_t i=0; i<validation_data_.size(); i++)
      delete validation_data_[i];
  }


  template<typename Data>
  void SubsetGenerator<Data>::build(const MatrixLookup& ml)
  {
    if (!f_selector_)// no feature selection
      features_.push_back(utility::Index(ml.rows()));

    for (size_t k=0; k<size(); k++){
      training_target_.push_back(Target(target(),training_index(k)));
      validation_target_.push_back(Target(target(),validation_index(k)));
      if (f_selector_){
        // training data with no feature selection
        const MatrixLookup* train_data_all_feat =
          new MatrixLookup(ml, training_index(k), false);
        // use these data to create feature selection
        YAT_ASSERT(train_data_all_feat);
        f_selector_->update(*train_data_all_feat, training_target(k));
        // get features
        features_.push_back(f_selector_->features());
        YAT_ASSERT(train_data_all_feat);
        delete train_data_all_feat;
      }

      // Dynamically allocated. Must be deleted in destructor.
      training_data_.push_back(new MatrixLookup(ml,features_.back(),
                                                training_index(k)));
      validation_data_.push_back(new MatrixLookup(ml,features_.back(),
                                                  validation_index(k)));
    }

  }


  template<typename Data>
  void SubsetGenerator<Data>::build(const MatrixLookupWeighted& ml)
  {
    if (!f_selector_)// no feature selection
      features_.push_back(utility::Index(ml.rows()));

    for (size_t k=0; k<size(); k++){
      training_target_.push_back(Target(target(),training_index(k)));
      validation_target_.push_back(Target(target(),validation_index(k)));
      if (f_selector_){
        // training data with no feature selection
        const MatrixLookupWeighted* train_data_all_feat =
          new MatrixLookupWeighted(ml, utility::Index(ml.rows()),
                                   training_index(k));
        // use these data to create feature selection
        f_selector_->update(*train_data_all_feat, training_target(k));
        // get features
        features_.push_back(f_selector_->features());
        delete train_data_all_feat;
      }


      // Dynamically allocated. Must be deleted in destructor.
      training_data_.push_back(new MatrixLookupWeighted(ml, features_.back(),
                                                        training_index(k)));
      validation_data_.push_back(new MatrixLookupWeighted(ml, features_.back(),
                                                          validation_index(k)));
    }
  }

  template<typename Data>
  void SubsetGenerator<Data>::build(const KernelLookup& kernel)
  {
    for (size_t k=0; k<size(); k++){
      training_target_.push_back(Target(target(),training_index(k)));
      validation_target_.push_back(Target(target(),validation_index(k)));

      if (f_selector_){
        if (kernel.weighted()){
          MatrixLookupWeighted ml = kernel.data_weighted();
          f_selector_->update(MatrixLookupWeighted(ml,
                                                   utility::Index(ml.rows()),
                                                   training_index(k)),
                              training_target(k));
        }
        else {
          MatrixLookup ml=kernel.data();
          f_selector_->update(MatrixLookup(ml,training_index(k), false),
                              training_target(k));
        }
        features_.push_back(f_selector_->features());
        KernelLookup kl = kernel.selected(features_.back());
        // Dynamically allocated. Must be deleted in destructor.
        training_data_.push_back(new KernelLookup(kl,training_index(k),
                                                  training_index(k)));
        validation_data_.push_back(new KernelLookup(kl, training_index(k),
                                                    validation_index(k)));
      }
      else {// no feature selection
        training_data_.push_back(new KernelLookup(kernel, training_index(k),
                                                  training_index(k)));
        validation_data_.push_back(new KernelLookup(kernel,
                                                    training_index(k),
                                                    validation_index(k)));
      }

    }
    if (!f_selector_){
      if (kernel.weighted())
        features_.push_back(utility::Index(kernel.data_weighted().rows()));
      else
        features_.push_back(utility::Index(kernel.data().rows()));
    }
  }


  template<typename Data>
  size_t SubsetGenerator<Data>::size(void) const
  {
    return sampler_.size();
  }


  template<typename Data>
  const Target& SubsetGenerator<Data>::target(void) const
  {
    return sampler_.target();
  }


  template<typename Data>
  const Data&
  SubsetGenerator<Data>::training_data(size_t i) const
  {
    return *(training_data_[i]);
  }


  template<typename Data>
  const utility::Index&
  SubsetGenerator<Data>::training_features(size_t i) const
  {
    YAT_ASSERT(features_.size());
    return f_selector_ ? features_[i] : features_[0];
  }


  template<typename Data>
  const utility::Index&
  SubsetGenerator<Data>::training_index(size_t i) const
  {
    return sampler_.training_index(i);
  }


  template<typename Data>
  const Target&
  SubsetGenerator<Data>::training_target(size_t i) const
  {
    return training_target_[i];
  }


  template<typename Data>
  const Data&
  SubsetGenerator<Data>::validation_data(size_t i) const
  {
    return *(validation_data_[i]);
  }


  template<typename Data>
  const utility::Index&
  SubsetGenerator<Data>::validation_index(size_t i) const
  {
    return sampler_.validation_index(i);
  }


  template<typename Data>
  const Target&
  SubsetGenerator<Data>::validation_target(size_t i) const
  {
    return validation_target_[i];
  }

}}} // of namespace classifier, yat, and theplu

#endif