source: trunk/yat/classifier/SVM.h @ 722

#ifndef _theplu_yat_classifier_svm_
#define _theplu_yat_classifier_svm_

// $Id$

/*
  Copyright (C) The authors contributing to this file.

  This file is part of the yat library, http://lev.thep.lu.se/trac/yat

  The yat library is free software; you can redistribute it and/or
  modify it under the terms of the GNU General Public License as
  published by the Free Software Foundation; either version 2 of the
  License, or (at your option) any later version.

  The yat library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
  02111-1307, USA.
*/

#include "KernelLookup.h"
#include "SupervisedClassifier.h"
#include "SVindex.h"
#include "Target.h"
#include "yat/utility/vector.h"

#include <utility>
#include <vector>
namespace theplu {
namespace yat {
namespace classifier {

  class DataLookup2D;
  ///
  /// @brief Support Vector Machine
  ///
  /// Class for SVM using Keerthi's second modification of Platt's
  /// Sequential Minimal Optimization. The SVM uses all data given for
  /// training. If validation or testing is wanted, this should be
  /// handled outside the class (in the kernel).
  ///
  class SVM : public SupervisedClassifier
  {

  public:
    ///
    /// Constructor taking the kernel and the target vector as
    /// input.
    ///
    /// @note If @a target or @a kernel is destroyed, the behaviour
    /// is undefined.
    ///
    SVM(const KernelLookup& kernel, const Target& target);

    ///
    /// Destructor
    ///
    virtual ~SVM();

    const DataLookup2D& data(void) const;

    ///
    /// If the DataLookup2D is not a KernelLookup, a std::bad_cast
    /// exception is thrown.
    ///
    SupervisedClassifier*
    make_classifier(const DataLookup2D&, const Target&) const;

    ///
    /// @return \f$ \alpha \f$
    ///
    const utility::vector& alpha(void) const;

    ///
    /// The C-parameter is the balance term (see train()). A very
    /// large C means the training will be focused on getting samples
    /// correctly classified, with risk for overfitting and poor
    /// generalisation. Too small a C results in a training where
    /// misclassifications are hardly penalized. C is weighted with
    /// respect to class size, so \f$ n_+C_+ = n_-C_- \f$, meaning a
    /// misclassification of the smaller group is penalized
    /// harder. This balance is equivalent to the one occurring in
    /// regression with regularisation, or ANN training with a
    /// weight-decay term. Default is C set to infinity.
    ///
    /// @return mean of vector \f$ C_i \f$
    ///
    double C(void) const;
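
    // A hypothetical numeric illustration of the class weighting above
    // (not from the library documentation): with n+ = 10 samples in
    // the positive class and n- = 90 in the negative class, the
    // constraint n+C+ = n-C- gives C+ = 9*C-, so misclassifying a
    // sample from the smaller class costs nine times as much.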

    ///
    /// Default is max_epochs set to 10,000,000.
    ///
    /// @return maximal number of epochs
    ///
    long int max_epochs(void) const;

    /**
       The output is calculated as \f$ o_i = \sum \alpha_j t_j K_{ij}
       + bias \f$, where \f$ t \f$ is the target.

       @return output
    */
    const theplu::yat::utility::vector& output(void) const;

    /**
       Generates the prediction @a predict from @a input. The
       prediction is calculated as the output times the margin, i.e.,
       the geometric distance from the decision hyperplane: \f$
       \frac{\sum \alpha_j t_j K_{ij} + bias}{w} \f$. The prediction
       has 2 rows. The first row is for binary target true, and the
       second is for binary target false. The second row is
       superfluous as it is the first row negated. It exists only to
       be aligned with multi-class SupervisedClassifiers. Each column
       in @a input and @a predict corresponds to a sample to predict.
       Each row in @a input corresponds to a training sample; more
       exactly, row i in @a input should correspond to row i in the
       KernelLookup that was used for training.
    */
    void predict(const DataLookup2D& input, utility::matrix& predict) const;
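
    // A minimal usage sketch of predict() (an illustration with
    // hypothetical names, using only members declared in this header):
    // `test_kernel` is a DataLookup2D whose row i matches row i of the
    // training KernelLookup, and whose columns are the test samples.
    //
    //   utility::matrix predictions;
    //   svm.predict(test_kernel, predictions);
    //   // predictions(0,k): margin-scaled output of sample k (target true)
    //   // predictions(1,k): row 0 negated (target false)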

    ///
    /// @return output times margin (i.e., geometric distance from the
    /// decision hyperplane) for data @a input
    ///
    double predict(const DataLookup1D& input) const;

    ///
    /// @return output times margin for data @a input with its
    /// corresponding weights
    ///
    double predict(const DataLookupWeighted1D& input) const;

    ///
    /// @brief Sets \f$ \alpha=0 \f$ and makes the SVM untrained.
    ///
    void reset(void);

    ///
    /// @brief Sets the C-parameter.
    ///
    void set_C(const double);

    /**
       Trains the SVM following Platt's SMO, with Keerthi's
       modification. Minimizes \f$ \frac{1}{2}\sum
       y_iy_j\alpha_i\alpha_j(K_{ij}+\frac{1}{C_i}\delta_{ij}) - \sum
       \alpha_i \f$, which corresponds to minimizing \f$ \sum w_i^2 +
       \sum C_i\xi_i^2 \f$.

       @note If the training problem is not linearly separable and C
       is set to infinity, the minimum is located at infinity, and
       thus it will not be reached within the maximal number of
       epochs. More exactly, when the problem is not linearly
       separable, there exists an eigenvector of \f$
       H_{ij}=y_iy_jK_{ij} \f$ within the space defined by the
       conditions \f$ \alpha_i>0 \f$ and \f$ \sum \alpha_i y_i = 0
       \f$. As the eigenvalue is zero in this direction, the quadratic
       term does not contribute to the objective; the objective then
       consists of the linear term only and hence has no minimum. This
       problem only occurs when \f$ C \f$ is set to infinity, because
       for a finite \f$ C \f$ all eigenvalues are finite. However, for
       a large \f$ C \f$ (and a training problem that is not linearly
       separable) there exists an eigenvector corresponding to a small
       eigenvalue, which means the minimum has moved from infinity to
       "very far away". In practice this also results in the minimum
       not being reached within the maximal number of epochs, and the
       value of \f$ C \f$ should be decreased.

       @return true if successful
    */
    bool train();
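
    // A minimal end-to-end usage sketch (an illustration, not from the
    // library documentation). It assumes a KernelLookup `kernel` and a
    // Target `target` have been constructed elsewhere and outlive the
    // SVM (see the constructor note); `test_kernel` is hypothetical,
    // and only members declared in this header are used.
    //
    //   SVM svm(kernel, target);
    //   svm.set_C(100);                    // finite C: soft-margin machine
    //   if (svm.train()) {
    //     const utility::vector& a = svm.alpha();  // one weight per sample
    //     const utility::vector& o = svm.output(); // training-set outputs
    //     utility::matrix predictions;
    //     svm.predict(test_kernel, predictions);   // see predict() above
    //   }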

  private:
    ///
    /// Copy constructor. (not implemented)
    ///
    SVM(const SVM&);

    ///
    /// Calculates bounds for alpha2
    ///
    void bounds(double&, double&) const;

    ///
    /// @brief calculates the bias term
    ///
    /// @return true if successful
    ///
    bool calculate_bias(void);

    ///
    /// Calculates the margin, which is the inverse of \f$ w \f$.
    ///
    void calculate_margin(void);

    ///
    /// Private function choosing which two elements should be
    /// updated. First checks for the largest violation of the
    /// condition (output - target = 0) among support vectors
    /// (alpha != 0). If no violation is found, checks sequentially
    /// among the other samples. If there is no violation there
    /// either, training is completed.
    ///
    /// @return true if a pair of samples that violate the conditions
    /// can be found
    ///
    bool choose(const theplu::yat::utility::vector&);
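
    // The selection heuristic above as a pseudocode sketch (an
    // interpretation of the comment, not the verified implementation):
    //
    //   best = argmax over support vectors (alpha_i != 0) of
    //          |output_i - target_i|
    //   if the best violation is above tolerance: update that pair
    //   else: scan the remaining samples sequentially for a violation
    //   if none is found: training is complete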

    ///
    /// @return kernel modified with diagonal term (soft margin)
    ///
    double kernel_mod(const size_t i, const size_t j) const;
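
    // Plausibly, given the objective in train(), kernel_mod adds the
    // soft-margin diagonal term (an assumption, not verified against
    // the implementation):
    //
    //   kernel_mod(i, j) = K(i, j) + (i == j ? 1/C_i : 0)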

    ///
    /// @return 1 if @a i belongs to binary target true, else -1
    ///
    int target(size_t i) const;

    utility::vector alpha_;
    double bias_;
    double C_inverse_;
    const KernelLookup* kernel_;
    double margin_;
    unsigned long int max_epochs_;
    utility::vector output_;
    bool owner_;
    SVindex sample_;
    bool trained_;
    double tolerance_;

  };

}}} // of namespace classifier, yat, and theplu

#endif