source: trunk/yat/classifier/SVM.h @ 1100

Last change on this file since 1100 was 1100, checked in by Peter, 14 years ago

fixes #313 - SVM constructor is void and passing kernel and target in train function instead

#ifndef _theplu_yat_classifier_svm_
#define _theplu_yat_classifier_svm_

// $Id$

/*
  Copyright (C) 2004, 2005 Jari Häkkinen, Peter Johansson
  Copyright (C) 2006 Jari Häkkinen, Markus Ringnér, Peter Johansson
  Copyright (C) 2007 Peter Johansson

  This file is part of the yat library, http://trac.thep.lu.se/yat

  The yat library is free software; you can redistribute it and/or
  modify it under the terms of the GNU General Public License as
  published by the Free Software Foundation; either version 2 of the
  License, or (at your option) any later version.

  The yat library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
  02111-1307, USA.
*/

#include "KernelLookup.h"
#include "SVindex.h"
#include "Target.h"
#include "yat/utility/vector.h"

#include <utility>
#include <vector>

namespace theplu {
namespace yat {
namespace classifier {

  class DataLookup2D;
  class Target;

  ///
  /// @brief Support Vector Machine
  ///
  /// Class for SVM using Keerthi's second modification of Platt's
  /// Sequential Minimal Optimization. The SVM uses all data given for
  /// training. If validation or testing is wanted, this should be
  /// handled outside the class (in the kernel).
  ///
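  /// A minimal usage sketch (the object names are hypothetical;
  /// assumes a KernelLookup \c kernel, a Target \c target, and a
  /// DataLookup1D \c sample have already been constructed):
  /// \code
  /// SVM svm;
  /// svm.set_C(1.0);                      // finite C for non-separable data
  /// svm.train(kernel, target);
  /// double score = svm.predict(sample);  // distance from the hyperplane
  /// \endcode
  ///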
  class SVM
  {

  public:
    ///
    /// \brief Constructor
    ///
    SVM(void);

    ///
    /// Destructor
    ///
    virtual ~SVM();

    const DataLookup2D& data(void) const;

    ///
    ///
    ///
    SVM* make_classifier(void) const;

    ///
    /// @return \f$ \alpha \f$
    ///
    const utility::vector& alpha(void) const;

    ///
    /// The C-parameter is the balance term (see train()). A very
    /// large C means the training will focus on getting samples
    /// correctly classified, with a risk of overfitting and poor
    /// generalisation. A too small C results in a training in which
    /// misclassifications are hardly penalized. C is weighted with
    /// respect to the group sizes, so \f$ n_+C_+ = n_-C_- \f$, meaning a
    /// misclassification of the smaller group is penalized
    /// harder. This balance is equivalent to the one occurring in
    /// regression with regularisation, or in ANN training with a
    /// weight-decay term. By default C is set to infinity.
    ///
    /// @return mean of vector \f$ C_i \f$
    ///
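    /// A small usage sketch (the value is illustrative only):
    /// \code
    /// svm.set_C(10.0);
    /// double c = svm.C();  // mean of the weighted C_i
    /// \endcode
    ///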
    double C(void) const;

    ///
    /// Default is max_epochs set to 100,000.
    ///
    /// @return maximal number of epochs
    ///
    long int max_epochs(void) const;

    /**
       \brief set maximal number of epochs in training
    */
    void max_epochs(long int);

    /**
       The output is calculated as \f$ o_i = \sum \alpha_j t_j K_{ij}
       + bias \f$, where \f$ t \f$ is the target.

       @return output
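
       A minimal sketch of reading the outputs after train() has been
       called:
       \code
       const theplu::yat::utility::vector& o = svm.output();
       // o(i) is the output for training sample i
       \endcode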
    */
    const theplu::yat::utility::vector& output(void) const;

    /**
       Generate prediction @a predict from @a input. The prediction
       is calculated as the output times the margin, i.e., the
       geometric distance from the decision hyperplane: \f$ \frac{
       \sum \alpha_j t_j K_{ij} + bias}{w} \f$. The output has 2
       rows. The first row is for binary target true, and the second
       is for binary target false. The second row is superfluous, as
       it is the first row negated; it exists only to be aligned with
       multi-class SupervisedClassifiers. Each column in @a input and
       @a predict corresponds to a sample to predict. Each row in @a
       input corresponds to a training sample; more precisely, row i
       in @a input should correspond to row i in the KernelLookup
       that was used for training.
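
       A small usage sketch (assuming \c input holds one column per
       sample to predict):
       \code
       theplu::yat::utility::matrix result;
       svm.predict(input, result);
       // result(0,k): prediction for sample k (binary target true)
       // result(1,k): the same value negated
       \endcode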
    */
    void predict(const DataLookup2D& input, utility::matrix& predict) const;

    ///
    /// @return output times margin (i.e., geometric distance from the
    /// decision hyperplane) for data @a input
    ///
    double predict(const DataLookup1D& input) const;

    ///
    /// @return output times margin for data @a input with
    /// corresponding @a weight
    ///
    double predict(const DataLookupWeighted1D& input) const;

    ///
    /// @brief sets the C-parameter
    ///
    void set_C(const double);

    /**
       Training the SVM following Platt's SMO, with Keerthi's
       modification. Minimizing \f$ \frac{1}{2}\sum
       y_iy_j\alpha_i\alpha_j(K_{ij}+\frac{1}{C_i}\delta_{ij}) - \sum
       \alpha_i \f$, which corresponds to minimizing \f$ \sum
       w_i^2+\sum C_i\xi_i^2 \f$.

       @note If the training problem is not linearly separable and C
       is set to infinity, the minimum is located at infinity, and
       thus will not be reached within the maximal number of
       epochs. More exactly, when the problem is not linearly
       separable, there exists an eigenvector of \f$
       H_{ij}=y_iy_jK_{ij} \f$ within the space defined by the
       conditions \f$ \alpha_i>0 \f$ and \f$ \sum \alpha_i y_i = 0
       \f$. As the eigenvalue is zero in this direction, the
       quadratic term does not contribute to the objective; the
       objective consists of the linear term only and hence has no
       minimum. This problem only occurs when \f$ C \f$ is set to
       infinity, because for a finite \f$ C \f$ all eigenvalues are
       finite. However, for a large \f$ C \f$ (and a training problem
       that is not linearly separable) there exists an eigenvector
       corresponding to a small eigenvalue, which means the minimum
       has moved from infinity to "very far away". In practice this
       will also result in the minimum not being reached within the
       maximal number of epochs, and the value of \f$ C \f$ should
       then be decreased.

       \throw if the maximal number of epochs is reached.
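
       A minimal training sketch with a finite C (names hypothetical;
       \c kernel is a KernelLookup and \c target a Target):
       \code
       SVM svm;
       svm.set_C(100);            // finite C keeps all eigenvalues finite
       svm.max_epochs(1000000);   // raise the epoch limit if needed
       svm.train(kernel, target);
       \endcode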
    */
    void train(const KernelLookup& kernel, const Target& target);

  private:
    ///
    /// Copy constructor. (not implemented)
    ///
    SVM(const SVM&);

    ///
    /// Calculates bounds for alpha2
    ///
    void bounds(double&, double&) const;

    ///
    /// @brief calculates the bias term
    ///
    void calculate_bias(void);

    ///
    /// Calculates the margin, i.e., the inverse of w
    ///
    void calculate_margin(void);

    ///
    /// Private function choosing which two elements should be
    /// updated. It first checks for the largest violation of the
    /// condition output - target = 0 among the support vectors
    /// (alpha != 0). If no violation is found there, it checks
    /// sequentially among the other samples. If no violation is
    /// found there either, training is completed.
    ///
    /// @return true if a pair of samples violating the conditions
    /// can be found
    ///
    bool choose(const theplu::yat::utility::vector&);

    ///
    /// @return kernel modified with diagonal term (soft margin)
    ///
    double kernel_mod(const size_t i, const size_t j) const;

    ///
    /// @return 1 if i belongs to binary target true, else -1
    ///
    int target(size_t i) const;

    utility::vector alpha_;
    double bias_;
    double C_inverse_;
    const KernelLookup* kernel_;
    double margin_;
    unsigned long int max_epochs_;
    utility::vector output_;
    SVindex sample_;
    Target target_;
    double tolerance_;
    bool trained_;

  };

}}} // of namespace classifier, yat, and theplu

#endif