#ifndef _theplu_yat_classifier_svm_
#define _theplu_yat_classifier_svm_
// $Id$
/*
Copyright (C) 2004, 2005 Jari Häkkinen, Peter Johansson
Copyright (C) 2006 Jari Häkkinen, Markus Ringnér, Peter Johansson
Copyright (C) 2007 Peter Johansson
This file is part of the yat library, http://trac.thep.lu.se/yat
The yat library is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
The yat library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
02111-1307, USA.
*/
#include "SVindex.h"
#include "Target.h"
#include "yat/utility/Vector.h"

// NOTE(review): the two system includes below had their header names
// stripped in this copy of the file (bare "#include" lines). <cstddef>
// is required for the size_t used in this header; <utility> is the
// best-guess reconstruction of the second include -- confirm against
// the yat repository.
#include <cstddef>
#include <utility>
namespace theplu {
namespace yat {
namespace utility{
class Matrix;
}
namespace classifier {
class DataLookup1D;
class DataLookupWeighted1D;
class KernelLookup;
/**
\brief Support Vector Machine

Binary large-margin classifier. Training minimizes a soft-margin
objective via Sequential Minimal Optimization (see train()); the
kernel is supplied externally through a KernelLookup.
*/
class SVM
{
public:
///
/// \brief Constructor
///
SVM(void);
/**
\brief Copy constructor.
*/
SVM(const SVM&);
///
/// \brief Destructor
///
virtual ~SVM();
/**
\brief Create an untrained copy of SVM.
\returns A dynamically allocated SVM, which has to be deleted
by the caller to avoid memory leaks.
\note Ownership is transferred to the caller as a raw pointer;
wrap the result in a smart pointer at the call site.
*/
SVM* make_classifier(void) const;
///
/// @return alpha parameters (Lagrange multipliers, one per
/// training sample)
///
const utility::Vector& alpha(void) const;
///
/// The C-parameter is the balance term (see train()). A very
/// large C means the training will be focused on getting samples
/// correctly classified, with risk for overfitting and poor
/// generalisation. A too small C will result in a training, in
/// which misclassifications are not penalized. C is weighted with
/// respect to the size such that \f$ n_+C_+ = n_-C_- \f$, meaning
/// a misclassification of the smaller group is penalized
/// harder. This balance is equivalent to the one occurring for
/// %regression with regularisation, or ANN-training with a
/// weight-decay term. Default is C set to infinity.
///
/// @returns mean of vector \f$ C_i \f$
///
double C(void) const;
///
/// Default is max_epochs set to 100,000.
///
/// @return number of maximal epochs
///
/// @note NOTE(review): returns a signed long int while the value
/// is stored as an unsigned long int (max_epochs_); values above
/// LONG_MAX would overflow on return -- confirm intended range.
///
long int max_epochs(void) const;
/**
\brief set maximal number of epochs in training
*/
void max_epochs(long int);
/**
The output is calculated as \f$ o_i = \sum \alpha_j t_j K_{ij}
+ bias \f$, where \f$ t \f$ is the target.
@return output of training samples
*/
const theplu::yat::utility::Vector& output(void) const;
/**
Generate prediction @a predict from @a input. The prediction
is calculated as the output times the margin, i.e., geometric
distance from decision hyperplane: \f$ \frac{ \sum \alpha_j
t_j K_{ij} + bias}{|w|} \f$ The output has 2 rows. The first row
is for binary target true, and the second is for binary target
false. The second row is superfluous as it is the first row
negated. It exists just to be aligned with multi-class
SupervisedClassifiers. Each column in @a input and @a output
corresponds to a sample to predict. Each row in @a input
corresponds to a training sample, and more exactly row i in @a
input should correspond to row i in KernelLookup that was used
for training.
*/
void predict(const KernelLookup& input, utility::Matrix& predict) const;
/*
// NOTE(review): the two single-sample predict overloads below are
// intentionally commented out in this header; kept for reference.
///
/// @return output times margin (i.e. geometric distance from
/// decision hyperplane) from data @a input
///
double predict(const DataLookup1D& input) const;
///
/// @return output times margin from data @a input with
/// corresponding @a weight
///
double predict(const DataLookupWeighted1D& input) const;
*/
///
/// @brief sets the C-Parameter
///
void set_C(const double);
/**
Training the SVM following Platt's SMO, with Keerthi's
modification. Minimizing \f$ \frac{1}{2}\sum
y_iy_j\alpha_i\alpha_j(K_{ij}+\frac{1}{C_i}\delta_{ij}) - \sum
\alpha_i\f$, which corresponds to minimizing \f$ \sum
w_i^2+\sum C_i\xi_i^2 \f$.
@note If the training problem is not linearly separable and C
is set to infinity, the minima will be located in the infinity,
and thus the minimum will not be reached within the maximal
number of epochs. More exactly, when the problem is not
linearly separable, there exists an eigenvector to \f$
H_{ij}=y_iy_jK_{ij} \f$ within the space defined by the
conditions: \f$ \alpha_i>0 \f$ and \f$ \sum \alpha_i y_i = 0
\f$. As the eigenvalue is zero in this direction the quadratic
term does not contribute to the objective, but the objective
only consists of the linear term and hence there is no
minimum. This problem only occurs when \f$ C \f$ is set to
infinity because for a finite \f$ C \f$ all eigenvalues are
finite. However, for a large \f$ C \f$ (and training problem is
non-linearly separable) there exists an eigenvector
corresponding to a small eigenvalue, which means the minima has
moved from infinity to "very far away". In practice this will
also result in that the minima is not reached within the
maximal number of epochs and the value of \f$ C \f$ should be
decreased.
Class for SVM using Keerthi's second modification of Platt's
Sequential Minimal Optimization. The SVM uses all data given for
training.
\throw std::runtime_error if maximal number of epochs is reached.
*/
void train(const KernelLookup& kernel, const Target& target);
private:
///
/// Calculates bounds for alpha2 (the clipping interval [L, H]
/// used in an SMO pair update).
///
void bounds(double&, double&) const;
///
/// @brief calculates the bias term
///
void calculate_bias(void);
///
/// Calculate margin that is inverse of w
///
void calculate_margin(void);
///
/// Private function choosing which two elements should be
/// updated. First checking for the biggest violation of the
/// condition (output - target = 0) among support vectors
/// (alpha!=0). If no violation was found, check sequentially
/// among the other samples. If no violation is found there
/// either, training is completed.
///
/// @return true if a pair of samples that violate the conditions
/// can be found
///
bool choose(const theplu::yat::utility::Vector&);
///
/// @return kernel modified with diagonal term (soft margin),
/// i.e. the \f$ K_{ij}+\frac{1}{C_i}\delta_{ij} \f$ of train()
///
double kernel_mod(const size_t i, const size_t j) const;
///
/// @return 1 if i belongs to binary target true else -1
///
int target(size_t i) const;

// Lagrange multipliers, one per training sample (see alpha())
utility::Vector alpha_;
// bias term of the decision function (see calculate_bias())
double bias_;
// inverse of the C-parameter; appears as the 1/C_i diagonal
// term in the objective (see train() and kernel_mod())
double C_inverse_;
// kernel of the training data; not owned by SVM
const KernelLookup* kernel_;
// geometric margin, the inverse of |w| (see calculate_margin())
double margin_;
// maximal number of training epochs; default 100,000
unsigned long int max_epochs_;
// outputs of the training samples (see output())
utility::Vector output_;
// sample index bookkeeping for the SMO loop -- presumably tracks
// support-vector status; see SVindex and confirm in SVM.cc
SVindex sample_;
// targets the SVM was trained against
Target target_;
// NOTE(review): presumably the SMO convergence tolerance --
// confirm against SVM.cc
double tolerance_;
// set once train() has completed successfully
bool trained_;
};
}}} // of namespace classifier, yat, and theplu
#endif