source: trunk/yat/utility/NNI.h @ 1726

Last change on this file since 1726 was 1726, checked in by Jari Häkkinen, 12 years ago

Fixes #476. Improved estimate() doc and added test on estimate() return value.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 4.9 KB
Line 
1#ifndef _theplu_yat_utility_nni_
2#define _theplu_yat_utility_nni_
3
4// $Id: NNI.h 1726 2009-01-15 21:15:26Z jari $
5
6/*
7  Copyright (C) 2004 Jari Häkkinen
8  Copyright (C) 2005, 2006, 2007, 2008 Jari Häkkinen, Peter Johansson
9
10  This file is part of the yat library, http://dev.thep.lu.se/yat
11
12  The yat library is free software; you can redistribute it and/or
13  modify it under the terms of the GNU General Public License as
14  published by the Free Software Foundation; either version 3 of the
15  License, or (at your option) any later version.
16
17  The yat library is distributed in the hope that it will be useful,
18  but WITHOUT ANY WARRANTY; without even the implied warranty of
19  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20  General Public License for more details.
21
22  You should have received a copy of the GNU General Public License
23  along with yat. If not, see <http://www.gnu.org/licenses/>.
24*/
25
26#include "Matrix.h"
27
28#include <iostream>
29#include <utility>
30#include <vector>
31
32namespace theplu {
33namespace yat {
34namespace utility {
35
36  ///
37  /// @brief Interface class for nearest
38  /// neighbour imputation (NNI) algorithms.
39  ///
40  /// NNI algorithms implemented here is discussed in documents
41  /// created in the WeNNI project. This document will be released for
42  /// public access, and the necessary information for retrieving that
43  /// document will be provided here.
44  ///
45  /// Short introduction to NNI is that one may want to improve
46  /// (correct) uncertain data. Here, the data to be imputed is stored in a
47  /// matrix where rows similar to each other are used to adjust
48  /// uncertain data. The data matrix is accompanied by a weight
49  /// (uncertainty) matrix defining what data is to be considered as
50  /// 'certain' and what data is uncertain. The weight matrix can be
51  /// binary with 1's indicating that the data does not need
52  /// corrections, whereas a 0 means that the data should be replaced
53  /// by an imputed value. Naturally, the weight matrix can also be
54  /// continuous where values between 0 and 1 defines how certain a
55  /// data element is.
56  ///
57  /// The imputation depends on how similarity of rows of data is
58  /// defined and on the number of closest neighbours (here; rows) to
59  /// use in the imputation can be set.
60  ///
61  /// Implementation issues
62  ///
63  /// The current implementation treats rows where all data are tagged
64  /// are completely uncertain, i.e. all weights are zero, by
65  /// ignoring these lines in nearest neighbourhood
66  /// calculations. Importantly, this type of data are not changed
67  /// (imputed) either since there is no close neighbourhood defined
68  /// for this data.
69  ///
70  /// Rows that is completely identical in an imputation algorithm
71  /// sense will give problems since the distance between will usually
72  /// become zero. This is solved by setting zero distance to a small
73  /// number. Identical rows in this context are basically a
74  /// comparison between elements with non-zero uncertainty weights
75  /// only, and all these elements are equal. Zero weight elements are
76  /// not used in the comparison since these are considered as
77  /// non/sense values.
78  ///
79  class NNI
80  {
81  public:
82
83    ///
84    /// Base constructor for the nearest neighbour imputation
85    /// algorithms.
86    ///
87    NNI(const utility::Matrix& matrix,const utility::Matrix& weight,
88        const unsigned int neighbours);
89
90    virtual ~NNI(void) {};
91
92    /**
93       \brief Function doing the imputation.
94
95       The return value can be used as an indication of how well the
96       imputation worked. The return value should be zero if proper
97       pre-processing of data is done. An example of bad data is a
98       matrix with a column of zero weights, another is a
99       corresponding situation with a row with all weights zero.
100
101       \return The number of rows that have at least one value not
102       imputed.
103    */
104    virtual unsigned int estimate(void)=0;
105
106    ///
107    /// @return A const reference to the modified data.
108    ///
109    const utility::Matrix& imputed_data(void) const;
110
111    ///
112    /// @return indices of rows in data matrix not imputed
113    ///
114    const std::vector<size_t>& not_imputed(void) const;
115
116  protected:
117    /**
118       \f$ d_{ij}^2=\frac {\sum_{k=1}^C w_{ik} w_{jk} (x_{ik}-x_{jk})^2
119       }{\sum_{k=l}^C w_{ik} w_{jk} } \f$ where C is the number of columns
120    */
121    std::vector<std::pair<size_t,double> > 
122    calculate_distances(const size_t) const;
123    /// Contributing nearest neighbours are added up to the user set
124    /// number, and neighbours are disqualified if their element
125    /// (column) weight is zero
126    std::vector<size_t> 
127    nearest_neighbours(const size_t,
128                       const std::vector<std::pair<size_t,double> >&) const;
129    ///
130    /// original data matrix
131    ///
132    const utility::Matrix& data_;
133
134    ///
135    /// data after imputation
136    ///
137    utility::Matrix imputed_data_;
138
139    ///
140    /// number of neighbor to use
141    ///
142    unsigned int neighbours_;
143
144    ///
145    /// which rows are not imputed due to lack of data
146    ///
147    std::vector<size_t> not_imputed_;
148
149    ///
150    /// weight matrix
151    ///
152    const utility::Matrix& weight_;
153  };
154
155}}} // of namespace utility, yat, and theplu
156
157#endif
Note: See TracBrowser for help on using the repository browser.