source: trunk/yat/classifier/SVM.h @ 1108

Last change on this file since 1108 was 1108, checked in by Peter, 14 years ago

adding svm copy constructor

#ifndef _theplu_yat_classifier_svm_
#define _theplu_yat_classifier_svm_

// $Id$

/*
  Copyright (C) 2004, 2005 Jari Häkkinen, Peter Johansson
  Copyright (C) 2006 Jari Häkkinen, Markus Ringnér, Peter Johansson
  Copyright (C) 2007 Peter Johansson

  This file is part of the yat library, http://trac.thep.lu.se/yat

  The yat library is free software; you can redistribute it and/or
  modify it under the terms of the GNU General Public License as
  published by the Free Software Foundation; either version 2 of the
  License, or (at your option) any later version.

  The yat library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
  02111-1307, USA.
*/

#include "SVindex.h"
#include "Target.h"
#include "yat/utility/vector.h"

#include <utility>
#include <vector>

namespace theplu {
namespace yat {
namespace classifier {

  class DataLookup1D;
  class DataLookupWeighted1D;
  class KernelLookup;

  ///
  /// @brief Support Vector Machine
  ///
  /// Class for SVM using Keerthi's second modification of Platt's
  /// Sequential Minimal Optimization. The SVM uses all data given
  /// for training; if validation or testing is wanted, this should
  /// be taken care of outside the class (in the kernel).
  ///
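  /// A minimal usage sketch; \c kernel (a KernelLookup), \c target
  /// (a Target), and \c n_samples (the number of samples, i.e. the
  /// number of columns in \c kernel) are assumed to be set up already:
  /// \code
  /// SVM svm;
  /// svm.set_C(100);               // finite C gives a soft margin
  /// svm.train(kernel, target);
  /// utility::matrix scores(2, n_samples);
  /// svm.predict(kernel, scores);  // row 0: scores for binary target true
  /// \endcode
  ///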
  class SVM
  {

  public:
    ///
    /// \brief Constructor
    ///
    SVM(void);

    /**
       Copy constructor.
    */
    SVM(const SVM&);

    ///
    /// \brief Destructor
    ///
    virtual ~SVM();

    //const DataLookup2D& data(void) const;

    ///
    /// \brief Create a new SVM of the same type.
    ///
    /// @return A dynamically allocated SVM, which the caller is
    /// responsible for deleting.
    ///
    SVM* make_classifier(void) const;

    ///
    /// @return \f$ \alpha \f$
    ///
    const utility::vector& alpha(void) const;

    ///
    /// The C-parameter is the balance term (see train()). A very
    /// large C means the training will focus on getting samples
    /// correctly classified, with a risk of overfitting and poor
    /// generalisation. A too small C will result in a training in
    /// which misclassifications are not penalized. C is weighted
    /// with respect to group size, so \f$ n_+C_+ = n_-C_- \f$,
    /// meaning a misclassification of the smaller group is penalized
    /// harder. This balance is equivalent to the one occurring in
    /// regression with regularisation, or in ANN training with a
    /// weight-decay term. Default is C set to infinity.
    ///
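    /// As a worked example of the weighting: with \f$ n_+=25 \f$ and
    /// \f$ n_-=75 \f$, the constraint \f$ n_+C_+ = n_-C_- \f$ gives
    /// \f$ C_+ = 3C_- \f$, i.e., a misclassified sample from the
    /// smaller group is penalized three times harder.
    ///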
    /// @returns mean of vector \f$ C_i \f$
    ///
    double C(void) const;

    ///
    /// Default is max_epochs set to 100,000.
    ///
    /// @return maximal number of epochs
    ///
    long int max_epochs(void) const;

    /**
       \brief set maximal number of epochs in training
    */
    void max_epochs(long int);

    /**
       The output is calculated as \f$ o_i = \sum \alpha_j t_j K_{ij}
       + bias \f$, where \f$ t \f$ is the target.

       @return output
    */
    const theplu::yat::utility::vector& output(void) const;

    /**
       Generate prediction @a predict from @a input. The prediction
       is calculated as the output times the margin, i.e., the
       geometric distance from the decision hyperplane: \f$ \frac{\sum
       \alpha_j t_j K_{ij} + bias}{w} \f$. The output has 2 rows. The
       first row is for binary target true, and the second is for
       binary target false. The second row is superfluous, as it is
       the first row negated; it exists only to be aligned with
       multi-class SupervisedClassifiers. Each column in @a input and
       @a output corresponds to a sample to predict. Each row in @a
       input corresponds to a training sample; more exactly, row i in
       @a input should correspond to row i in the KernelLookup that
       was used for training.
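
       A minimal sketch of a call (\c svm, \c test_kernel, and \c
       n_test are illustrative assumptions; \c test_kernel holds
       kernel values between the test samples and the training
       samples):
       \code
       utility::matrix scores(2, n_test);  // one column per test sample
       svm.predict(test_kernel, scores);
       // scores(0, j): distance for test sample j w.r.t. target true
       // scores(1, j): the same value negated
       \endcode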
    */
    void predict(const KernelLookup& input, utility::matrix& predict) const;

    ///
    /// @return output times margin (i.e. geometric distance from the
    /// decision hyperplane) from data @a input
    ///
    double predict(const DataLookup1D& input) const;

    ///
    /// @return output times margin from data @a input with
    /// corresponding @a weight
    ///
    double predict(const DataLookupWeighted1D& input) const;

    ///
    /// @brief sets the C-parameter
    ///
    void set_C(const double);

    /**
       Train the SVM following Platt's SMO, with Keerthi's
       modification. Minimizes \f$ \frac{1}{2}\sum
       y_iy_j\alpha_i\alpha_j(K_{ij}+\frac{1}{C_i}\delta_{ij}) - \sum
       \alpha_i \f$, which corresponds to minimizing \f$ \sum
       w_i^2+\sum C_i\xi_i^2 \f$.

       @note If the training problem is not linearly separable and C
       is set to infinity, the minimum is located at infinity and
       thus will not be reached within the maximal number of
       epochs. More exactly, when the problem is not linearly
       separable, there exists an eigenvector of \f$
       H_{ij}=y_iy_jK_{ij} \f$ within the space defined by the
       conditions \f$ \alpha_i>0 \f$ and \f$ \sum \alpha_i y_i = 0
       \f$. As the eigenvalue is zero in this direction, the
       quadratic term does not contribute to the objective; the
       objective consists only of the linear term, and hence there is
       no minimum. This problem only occurs when \f$ C \f$ is set to
       infinity, because for a finite \f$ C \f$ all eigenvalues are
       finite. However, for a large \f$ C \f$ (and a training problem
       that is not linearly separable) there exists an eigenvector
       corresponding to a small eigenvalue, which means the minimum
       has moved from infinity to "very far away". In practice, this
       will also result in the minimum not being reached within the
       maximal number of epochs, and the value of \f$ C \f$ should be
       decreased.

       \throw An exception if the maximal number of epochs is reached.
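
       A hedged sketch of guarding against non-convergence (the exact
       exception type is not documented here, so a generic handler is
       shown; \c svm, \c kernel, and \c target are assumed from the
       class-level example):
       \code
       svm.max_epochs(100000);
       try {
         svm.train(kernel, target);
       }
       catch (const std::exception& e) {
         svm.set_C(1.0);  // likely non-separable data: retry with finite C
         svm.train(kernel, target);
       }
       \endcode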
    */
    void train(const KernelLookup& kernel, const Target& target);


  private:
    ///
    /// Calculates bounds for alpha2
    ///
    void bounds(double&, double&) const;

    ///
    /// @brief calculates the bias term
    ///
    void calculate_bias(void);

    ///
    /// Calculate the margin, i.e. the inverse of w
    ///
    void calculate_margin(void);

    ///
    /// Private function choosing which two elements should be
    /// updated. First check for the largest violation of the
    /// condition output - target = 0 among support vectors
    /// (alpha != 0). If no violation is found, check sequentially
    /// among the other samples. If there is no violation there
    /// either, training is completed.
    ///
    /// @return true if a pair of samples that violate the conditions
    /// can be found
    ///
    bool choose(const theplu::yat::utility::vector&);

    ///
    /// @return kernel modified with diagonal term (soft margin)
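    /// (given the objective in train(), presumably
    /// \f$ K_{ij}+\frac{1}{C_i}\delta_{ij} \f$)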
    ///
    double kernel_mod(const size_t i, const size_t j) const;

    ///
    /// @return 1 if i belongs to binary target true, else -1
    ///
    int target(size_t i) const;

    utility::vector alpha_;
    double bias_;
    double C_inverse_;
    const KernelLookup* kernel_;
    double margin_;
    unsigned long int max_epochs_;
    utility::vector output_;
    SVindex sample_;
    Target target_;
    double tolerance_;
    bool trained_;

  };

}}} // of namespace classifier, yat, and theplu

#endif