source: trunk/yat/classifier/SVM.h @ 706

Last change on this file since 706 was 706, checked in by Jari Häkkinen, 15 years ago

Addresses #65 and #170.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date ID
File size: 7.5 KB
#ifndef _theplu_yat_classifier_svm_
#define _theplu_yat_classifier_svm_

// $Id$

/*
  Copyright (C) The authors contributing to this file.

  This file is part of the yat library, http://lev.thep.lu.se/trac/yat

  The yat library is free software; you can redistribute it and/or
  modify it under the terms of the GNU General Public License as
  published by the Free Software Foundation; either version 2 of the
  License, or (at your option) any later version.

  The yat library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
  02111-1307, USA.
*/

#include "KernelLookup.h"
#include "SupervisedClassifier.h"
#include "SVindex.h"
#include "Target.h"
#include "yat/utility/vector.h"

#include <utility>
#include <vector>

namespace theplu {
namespace yat {
namespace classifier {

  class DataLookup2D;

  ///
  /// @brief Support Vector Machine
  ///
  /// Class for SVM using Keerthi's second modification of Platt's
  /// Sequential Minimal Optimization. The SVM uses all data given for
  /// training; validation or testing, if desired, must be handled
  /// outside the class (in the kernel).
  ///
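  /// A minimal usage sketch; my_kernel and my_target stand for
  /// already constructed KernelLookup and Target objects (their
  /// construction is application specific and not shown here):
  /// @code
  /// SVM svm(my_kernel, my_target);
  /// svm.set_C(100);                      // finite C gives a soft margin
  /// if (svm.train())
  ///   double decision = svm.output()(0); // decision value of sample 0
  /// @endcode
  ///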
  class SVM : public SupervisedClassifier
  {

  public:
    ///
    /// Constructor taking the kernel and the target vector as
    /// input.
    ///
    /// @note If the @a target or @a kernel is destroyed, the
    /// behaviour is undefined.
    ///
    SVM(const KernelLookup& kernel, const Target& target);

    ///
    /// Destructor
    ///
    virtual ~SVM();

    ///
    /// If the DataLookup2D is not a KernelLookup, a bad_cast
    /// exception is thrown.
    ///
    SupervisedClassifier*
    make_classifier(const DataLookup2D&, const Target&) const;

    ///
    /// @return \f$ \alpha \f$
    ///
    inline const utility::vector& alpha(void) const { return alpha_; }

    ///
    /// The C-parameter is the balance term (see train()). A very
    /// large C means the training will focus on getting samples
    /// correctly classified, with a risk of overfitting and poor
    /// generalisation. Too small a C results in a training where
    /// misclassifications are not penalized. C is weighted with
    /// respect to class size, so \f$ n_+C_+ = n_-C_- \f$, meaning a
    /// misclassification of the smaller group is penalized
    /// harder. This balance is equivalent to the one occurring in
    /// regression with regularisation, or ANN training with a
    /// weight-decay term. By default C is set to infinity.
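    ///
    /// As a worked illustration (assuming the mean below is taken
    /// over all samples): with \f$ n_+=10 \f$, \f$ n_-=90 \f$ and
    /// mean \f$ C=1 \f$, the balance \f$ n_+C_+=n_-C_- \f$ gives
    /// \f$ C_+=5 \f$ and \f$ C_-\approx 0.56 \f$, i.e.,
    /// misclassifying a sample from the smaller class costs nine
    /// times more.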
    ///
    /// @returns mean of vector \f$ C_i \f$
    ///
    inline double C(void) const { return 1.0/C_inverse_; }

    ///
    /// Default is max_epochs set to 10,000,000.
    ///
    /// @return maximal number of epochs
    ///
    inline long int max_epochs(void) const { return max_epochs_; }

    /**
       The output is calculated as \f$ o_i = \sum_j \alpha_j t_j K_{ij}
       + bias \f$, where \f$ t \f$ is the target.

       @return output
    */
    inline const theplu::yat::utility::vector&
    output(void) const { return output_; }

    /**
       Generate prediction @a predict from @a input. The prediction
       is calculated as the output times the margin, i.e., the
       geometric distance from the decision hyperplane: \f$ \frac{
       \sum_j \alpha_j t_j K_{ij} + bias}{w} \f$. The output has two
       rows. The first row is for binary target true, and the second
       is for binary target false. The second row is superfluous as
       it is the first row negated; it exists only to be aligned with
       multi-class SupervisedClassifiers. Each column in @a input and
       @a output corresponds to a sample to predict. Each row in @a
       input corresponds to a training sample; more exactly, row i in
       @a input should correspond to row i in the KernelLookup that
       was used for training.
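
       A hedged sketch (my_svm is a trained SVM as above; my_input is
       an illustrative DataLookup2D organised as described):
       @code
       theplu::yat::utility::matrix prediction;
       my_svm.predict(my_input, prediction);
       // prediction(0,j): signed distance for sample j (target true)
       // prediction(1,j) is the negation of prediction(0,j)
       @endcode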
    */
    void predict(const DataLookup2D& input, utility::matrix& predict) const;

    ///
    /// @return output times margin (i.e., geometric distance from
    /// the decision hyperplane) for data @a input
    ///
    double predict(const DataLookup1D& input) const;

    ///
    /// @return output times margin for data @a input with
    /// corresponding @a weight
    ///
    double predict(const DataLookupWeighted1D& input) const;

    ///
    /// @brief Sets \f$ \alpha=0 \f$ and makes the SVM untrained.
    ///
    void reset(void);

    ///
    /// @brief Sets the C-parameter.
    ///
    void set_C(const double);

    /**
       Trains the SVM following Platt's SMO, with Keerthi's
       modification. Minimizes \f$ \frac{1}{2}\sum
       y_iy_j\alpha_i\alpha_j(K_{ij}+\frac{1}{C_i}\delta_{ij}) - \sum
       \alpha_i \f$, which corresponds to minimizing \f$ \sum
       w_i^2+\sum C_i\xi_i^2 \f$.

       @note If the training problem is not linearly separable and C
       is set to infinity, the minimum is located at infinity, and
       thus it will not be reached within the maximal number of
       epochs. More exactly, when the problem is not linearly
       separable, there exists an eigenvector of \f$
       H_{ij}=y_iy_jK_{ij} \f$ within the space defined by the
       conditions \f$ \alpha_i>0 \f$ and \f$ \sum \alpha_i y_i = 0
       \f$. As the eigenvalue is zero in this direction, the
       quadratic term does not contribute to the objective; the
       objective consists only of the linear term and hence has no
       minimum. This problem only occurs when \f$ C \f$ is set to
       infinity, because for a finite \f$ C \f$ all eigenvalues are
       finite. However, for a large \f$ C \f$ (and a training problem
       that is not linearly separable) there exists an eigenvector
       corresponding to a small eigenvalue, which means the minimum
       has moved from infinity to "very far away". In practice this
       also means the minimum is not reached within the maximal
       number of epochs, and the value of \f$ C \f$ should be
       decreased.
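
       A hedged usage sketch of this failure mode (my_svm as above):
       @code
       my_svm.set_C(1e6);     // very large C; may fail to converge
       if (!my_svm.train()) {
         my_svm.reset();      // back to the untrained state
         my_svm.set_C(100);   // a smaller C moves the minimum closer
         my_svm.train();
       }
       @endcode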

       @return true if successful
    */
    bool train();

  private:
    ///
    /// Copy constructor. (not implemented)
    ///
    SVM(const SVM&);

    ///
    /// Calculates bounds for alpha2.
    ///
    void bounds(double&, double&) const;

    ///
    /// @brief Calculates the bias term.
    ///
    /// @return true if successful
    ///
    bool calculate_bias(void);

    ///
    /// Calculates the margin, i.e., the inverse of w.
    ///
    void calculate_margin(void);

    ///
    /// Private function choosing which two elements should be
    /// updated. It first checks for the biggest violation (output -
    /// target = 0) among the support vectors (alpha!=0). If no
    /// violation is found, it checks sequentially among the other
    /// samples. If there is no violation there either, training is
    /// completed.
    ///
    /// @return true if a pair of samples violating the conditions
    /// can be found
    ///
    bool choose(const theplu::yat::utility::vector&);

    ///
    /// @return kernel modified with diagonal term (soft margin)
    ///
    inline double kernel_mod(const size_t i, const size_t j) const
    { return i!=j ? (*kernel_)(i,j) : (*kernel_)(i,j) + C_inverse_; }
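
    // The diagonal term C_inverse_ (= 1/C_i) implements the quadratic
    // slack penalty of the objective minimized in train(); with C set
    // to infinity the kernel is left unchanged.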

    /// @return 1 if i belongs to binary target true, else -1
    inline int target(size_t i) const { return target_.binary(i) ? 1 : -1; }

    utility::vector alpha_;
    double bias_;
    double C_inverse_;
    const KernelLookup* kernel_;
    double margin_;
    unsigned long int max_epochs_;
    utility::vector output_;
    bool owner_;
    SVindex sample_;
    bool trained_;
    double tolerance_;

  };

}}} // of namespace classifier, yat, and theplu

#endif