#ifndef _theplu_yat_utility_nni_
#define _theplu_yat_utility_nni_
// $Id: NNI.h 1487 2008-09-10 08:41:36Z jari $
/*
Copyright (C) 2004 Jari Häkkinen
Copyright (C) 2005, 2006, 2007, 2008 Jari Häkkinen, Peter Johansson
This file is part of the yat library, http://dev.thep.lu.se/yat
The yat library is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 3 of the
License, or (at your option) any later version.
The yat library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License
along with yat. If not, see <http://www.gnu.org/licenses/>.
*/
#include "Matrix.h"
#include <cstddef>
#include <utility>
#include <vector>
namespace theplu {
namespace yat {
namespace utility {
///
/// @brief Interface class for nearest
/// neighbour imputation (NNI) algorithms.
///
/// The NNI algorithms implemented here are discussed in documents
/// created in the WeNNI project. These documents will be released for
/// public access, and the necessary information for retrieving them
/// will be provided here.
///
/// Short introduction to NNI is that one may want to improve
/// (correct) uncertain data. Here, the data to be imputed is stored in a
/// matrix where rows similar to each other are used to adjust
/// uncertain data. The data matrix is accompanied by a weight
/// (uncertainty) matrix defining what data is to be considered as
/// 'certain' and what data is uncertain. The weight matrix can be
/// binary with 1's indicating that the data does not need
/// corrections, whereas a 0 means that the data should be replaced
/// by an imputed value. Naturally, the weight matrix can also be
/// continuous where values between 0 and 1 define how certain a
/// data element is.
///
/// The imputation depends on how similarity between rows of data is
/// defined and on the number of closest neighbours (here: rows) used
/// in the imputation; the number of neighbours can be set by the user.
///
/// Implementation issues
///
/// The current implementation treats rows where all data are tagged
/// as completely uncertain, i.e. all weights are zero, by
/// ignoring these rows in nearest neighbourhood
/// calculations. Importantly, this type of data is not changed
/// (imputed) either, since there is no close neighbourhood defined
/// for this data.
///
/// Rows that are completely identical in an imputation algorithm
/// sense will give problems since the distance between them will usually
/// become zero. This is solved by setting zero distance to a small
/// number. Identity in this context is determined by comparing
/// only elements with non-zero uncertainty weights: rows are
/// identical if all these elements are equal. Zero weight elements are
/// not used in the comparison since these are considered as
/// nonsense values.
///
class NNI
{
public:
  ///
  /// @brief Base constructor for the nearest neighbour imputation
  /// algorithms.
  ///
  /// @param matrix data matrix to be imputed
  /// @param weight weight (uncertainty) matrix accompanying @a matrix
  /// @param neighbours number of nearest neighbours to use
  ///
  /// @note This class holds const references to Matrix objects (see
  /// data_ and weight_). Presumably these refer to @a matrix and
  /// @a weight, in which case both must outlive the NNI object --
  /// confirm against the constructor definition.
  ///
  NNI(const utility::Matrix& matrix,const utility::Matrix& weight,
      const unsigned int neighbours);

  ///
  /// @brief Virtual destructor, needed since NNI is a polymorphic
  /// base class (estimate() is pure virtual).
  ///
  virtual ~NNI(void) {}

  ///
  /// @brief Function doing the imputation.
  ///
  /// Must be implemented by every concrete imputation algorithm.
  ///
  /// @return number of rows not imputed
  ///
  virtual unsigned int estimate(void)=0;

  ///
  /// @return A const reference to the modified data.
  ///
  const utility::Matrix& imputed_data(void) const;

  ///
  /// @return indices of rows in data matrix not imputed
  ///
  const std::vector<size_t>& not_imputed(void) const;

protected:
  // NOTE(review): the template arguments of the std::vector types
  // below (size_t for row indices, std::pair<size_t,double> for
  // distances) were reconstructed from the accompanying comments --
  // confirm against the definitions in the implementation file.

  /**
     \f$ d_{ij}^2=\frac {\sum_{k=1}^C w_{ik} w_{jk} (x_{ik}-x_{jk})^2
     }{\sum_{k=1}^C w_{ik} w_{jk} } \f$ where C is the number of columns

     The argument is presumably the index i of the row for which
     distances are calculated, and each returned pair presumably
     holds a row index j together with the corresponding distance --
     verify against the implementation.
  */
  std::vector<std::pair<size_t,double> >
  calculate_distances(const size_t) const;

  /// Contributing nearest neighbours are added up to the user set
  /// number, and neighbours are disqualified if their element
  /// (column) weight is zero
  ///
  /// The first argument is presumably the column for which the
  /// element weight is checked, and the second argument a list of
  /// distances as returned by calculate_distances() -- verify
  /// against the implementation.
  std::vector<size_t>
  nearest_neighbours(const size_t,
                     const std::vector<std::pair<size_t,double> >&) const;

  // The data members below must keep their declaration order since
  // it determines the order in which they are initialized.

  ///
  /// original data matrix (held by reference, not copied)
  ///
  const utility::Matrix& data_;

  ///
  /// data after imputation
  ///
  utility::Matrix imputed_data_;

  ///
  /// number of neighbours to use
  ///
  unsigned int neighbours_;

  ///
  /// which rows are not imputed due to lack of data
  ///
  std::vector<size_t> not_imputed_;

  ///
  /// weight matrix (held by reference, not copied)
  ///
  const utility::Matrix& weight_;
};
}}} // of namespace utility, yat, and theplu
#endif