source: trunk/yat/normalizer/qQuantileNormalizer.h @ 1740

Last change on this file since 1740 was 1740, checked in by Peter, 12 years ago

addresses #478 - implemented weighted normalization. Tests still need to be added, in particular to verify that the templates compile.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 10.8 KB
Line 
1#ifndef _theplu_yat_normalizer_qquantile_normalizer_
2#define _theplu_yat_normalizer_qquantile_normalizer_
3
4/*
5  Copyright (C) 2009 Jari Häkkinen, Peter Johansson
6
7  This file is part of the yat library, http://dev.thep.lu.se/yat
8
9  The yat library is free software; you can redistribute it and/or
10  modify it under the terms of the GNU General Public License as
11  published by the Free Software Foundation; either version 3 of the
12  License, or (at your option) any later version.
13
14  The yat library is distributed in the hope that it will be useful,
15  but WITHOUT ANY WARRANTY; without even the implied warranty of
16  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  General Public License for more details.
18
19  You should have received a copy of the GNU General Public License
20  along with yat. If not, see <http://www.gnu.org/licenses/>.
21*/
22
23#include "yat/regression/CSplineInterpolation.h"
24#include "yat/utility/DataIterator.h"
25#include "yat/utility/DataWeight.h"
26#include "yat/utility/iterator_traits.h"
27#include "yat/utility/Vector.h"
28#include "yat/utility/WeightIterator.h"
29#include "yat/utility/yat_assert.h"
30
31#include <algorithm>
32#include <iterator>
33#include <stdexcept>
34#include <vector>
35
36namespace theplu {
37namespace yat {
38namespace utility {
39  class VectorBase;
40}
41namespace normalizer {
42
  /**
     \brief Perform Q-quantile normalization

     After a Q-quantile normalization each column has approximately
     the same distribution of data (the Q-quantiles are the
     same). Also, within each column the rank of an element is not
     changed.

     Both unweighted and weighted data are supported; which algorithm
     is used is selected at compile time from the iterator type (see
     utility::weighted_iterator_traits). In the weighted case the
     weights are copied through unchanged and the data part is
     normalized.

     The normalization goes like this
     - Data is not assumed to be sorted.
     - Partition sorted target data in N parts. N must be 3 or larger
       because of requirements from the underlying cspline fit
     - Calculate the arithmetic mean for each part, the mean is
       assigned to the mid point of each part.
     - Do the above for the data to be transformed (called source
       here).
     - For each part, calculate the difference between the target and
       the source. Now we have N differences d_i with associated rank
       (midpoint of each part).
     - Create a cubic spline fit to this difference vector d. The
       resulting curve is used to recalculate all column values.
       - Use the cubic spline fit for values within the cubic spline
         fit range [midpoint 1st part, midpoint last part].
       - For data outside the cubic spline fit use linear
         extrapolation, i.e., a constant shift. d_first for points
         below fit range, and d_last for points above fit range.

     \since New in yat 0.5
   */
  class qQuantileNormalizer
  {
  public:
    /**
       \brief Create a normalizer from the target distribution
       [\a first, \a last).

       The target range is partitioned into \a Q parts, and the part
       averages are stored for use by subsequent calls to operator().

       \a Q is the number of parts and must be within \f$ [3,N] \f$
       where \f$ N \f$ is the total number of data points in the
       target. However, if \f$ N \f$ is larger than the number of points
       in the data to be normalized the behaviour of the code is
       undefined. Keep \f$ N \f$ equal to or less than the smallest
       number of data points in the target or each data set to be
       normalized against a given target. The lower bound of three is
       due to restrictions in the cspline fit utilized in the
       normalization.
    */
    template<typename BidirectionalIterator>
    qQuantileNormalizer(BidirectionalIterator first, BidirectionalIterator last,
                        unsigned int Q);

    /**
       \brief perform the Q-quantile normalization.

       Normalizes the source range [\a first, \a last) against the
       target distribution given at construction, writing the
       normalized values through \a result.

       It is possible to normalize "in place"; it is permissible for
       \a result to refer to the same range as [\a first, \a last).

       \note The ranges must have matching sizes: \a result must be
       able to hold last-first elements.
     */
    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
    void operator()(RandomAccessIterator1 first, RandomAccessIterator1 last,
                    RandomAccessIterator2 result) const;

  private:

  /**
     \brief Partition a vector of data into equal sizes.

     The class also calculates the average of each part and assigns
     the average to the mid point of each part. The midpoint is a
     double, i.e., it is not forced to be an integer index.
  */
  class Partitioner
  {
  public:
    /**
       \brief Create the partition and perform required calculations.

       Dispatches to the weighted or unweighted build() depending on
       the iterator type.
    */
    template<typename BidirectionalIterator>
    Partitioner(BidirectionalIterator first, BidirectionalIterator last, 
                unsigned int N);

    /**
       \brief Return the averages for each part.

       \return The average vector.
    */
    const utility::Vector& averages(void) const;

    /**
       \brief Return the mid point for each partition.

       \return The index vector.
    */
    const utility::Vector& index(void) const;

    /**
       \return The number of parts.
    */
    size_t size(void) const;

  private:
    // unweighted "constructor": copy, sort, then compute averages/midpoints
    template<typename Iterator>
    void build(Iterator first, Iterator last, unsigned int N, 
               utility::unweighted_iterator_tag);
    // weighted "constructor": copy data+weights, sort, then compute
    template<typename Iterator>
    void build(Iterator first, Iterator last, unsigned int N, 
               utility::weighted_iterator_tag);
    void init(const utility::VectorBase&, unsigned int N);
    void init(const std::vector<utility::DataWeight>&, unsigned int N);

    utility::Vector average_;
    utility::Vector index_;
  };


    Partitioner target_;

    // unweighted version
    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
    void normalize(const Partitioner& source,RandomAccessIterator1 first, 
                   RandomAccessIterator1 last, RandomAccessIterator2 result,
                   utility::unweighted_iterator_tag tag) const;

    // weighted version
    // copy weights and apply unweighted version on data part
    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
    void normalize(const Partitioner& source,RandomAccessIterator1 first, 
                   RandomAccessIterator1 last, RandomAccessIterator2 result,
                   utility::weighted_iterator_tag tag) const;
  };
176
177
178  // template implementations
179
  // Constructor: partition the target range into Q parts and cache the
  // part averages/midpoints in target_ for later normalization calls.
  //
  // NOTE(review): target_ is constructed in the member initializer list,
  // i.e. BEFORE the assertion below runs. An invalid Q (< 3) therefore
  // reaches Partitioner first; the assert only fires afterwards. Consider
  // validating Q before constructing target_ — TODO confirm intended order.
  template<typename BidirectionalIterator>
  qQuantileNormalizer::qQuantileNormalizer(BidirectionalIterator first, 
                                           BidirectionalIterator last,
                                           unsigned int Q)
    : target_(Partitioner(first, last, Q))
  {
    // Q must be at least 3 due to the cspline fit used in normalize().
    utility::yat_assert<std::runtime_error>(Q>2, 
                                            "qQuantileNormalizer: Q too small");
  }
189
190
191  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
192  void qQuantileNormalizer::operator()(RandomAccessIterator1 first, 
193                                       RandomAccessIterator1 last,
194                                       RandomAccessIterator2 result) const
195  {
196    Partitioner source(first, last, target_.size());
197    typename utility::weighted_iterator_traits<RandomAccessIterator2>::type tag;
198    normalize(source, first, last, result, tag);
199  }
200
201
  // Unweighted normalization: shift every source value by the
  // (interpolated) difference between source and target part averages.
  //
  // The range is processed in rank order (via sort_index) in three
  // zones: constant shift diff(0) below the first midpoint, cspline
  // interpolation between the first and last midpoints, and constant
  // shift diff(last) above the last midpoint.
  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
  void 
  qQuantileNormalizer::normalize(const qQuantileNormalizer::Partitioner& source,
                                 RandomAccessIterator1 first, 
                                 RandomAccessIterator1 last, 
                                 RandomAccessIterator2 result,
                                 utility::unweighted_iterator_tag tag) const
  {
    size_t N = last-first;
    // the source range must contain at least as many points as there
    // are parts, otherwise the part boundaries below are meaningless
    utility::yat_assert<std::runtime_error>
      (N >= target_.size(), "qQuantileNormalizer: Input range too small");

    // sorted_index[rank] gives the position in [first, last) of the
    // element with that rank
    std::vector<size_t> sorted_index(last-first);
    utility::sort_index(first, last, sorted_index);

    // per-part shift: source average minus target average
    utility::Vector diff(source.averages());
    diff-=target_.averages();
    const utility::Vector& idx=target_.index();
    regression::CSplineInterpolation cspline(idx,diff);

    // linear interpolation for first part, i.e., use first diff for
    // all points in the first part.
    size_t start=0;
    size_t end=static_cast<unsigned int>(idx(0));
    // The first condition below takes care of limiting case number
    // of parts approximately equal to the number of matrix rows and
    // the second condition makes sure that index is large enough
    // when using cspline below ... the static cast above takes the
    // floor whereas we want to take the "roof" forcing next index
    // range to be within interpolation range for the cspline.
    if ((end==0) || (end<idx(0)))
      ++end;
    for (size_t row=start; row<end; ++row) {
      size_t srow=sorted_index[row];
      result[srow] = first[srow] - diff(0);
    }
   
    // cspline interpolation for all data between the mid points of
    // the first and last part
    start=end;
    end=static_cast<unsigned int>(idx(target_.size()-1));
    // take care of limiting case number of parts approximately
    // equal to the number of matrix rows
    if (end==(static_cast<size_t>(N-1)) )
      --end;
    // note: inclusive upper bound here (<=), unlike the other zones
    for (size_t row=start; row<=end; ++row) {
      size_t srow=sorted_index[row];
      result[srow] = first[srow] - cspline.evaluate(row) ;
    }
   
    // linear interpolation for last part, i.e., use last diff for
    // all points in the last part.
    start=end+1;
    end=N;
    for (size_t row=start; row<end; ++row) {
      size_t srow=sorted_index[row];
      result[srow] = first[srow] - diff(diff.size()-1);
    }
  }
261
262
263  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
264  void qQuantileNormalizer::normalize(const Partitioner& source,
265                                      RandomAccessIterator1 first, 
266                                      RandomAccessIterator1 last, 
267                                      RandomAccessIterator2 result,
268                                      utility::weighted_iterator_tag tag) const
269  {
270    // copy the weights
271    std::copy(utility::weight_iterator<RandomAccessIterator1>(first), 
272              utility::weight_iterator<RandomAccessIterator1>(last), 
273              utility::weight_iterator<RandomAccessIterator2>(result)); 
274    // apply algorithm on data part of range
275    normalize(source, utility::data_iterator<RandomAccessIterator1>(first),
276              utility::data_iterator<RandomAccessIterator1>(last),
277              utility::data_iterator<RandomAccessIterator2>(result),
278              utility::unweighted_iterator_tag());
279  }
280
281
282  template<typename BidirectionalIterator>
283  qQuantileNormalizer::Partitioner::Partitioner(BidirectionalIterator first, 
284                                                BidirectionalIterator last, 
285                                                unsigned int N)
286    : average_(utility::Vector(N)), index_(utility::Vector(N))
287  {
288    typedef typename 
289      utility::weighted_iterator_traits<BidirectionalIterator>::type tag;
290    build(first, last, N, tag());
291         
292  }
293
294
295  template<typename Iterator>
296  void qQuantileNormalizer::Partitioner::build(Iterator first, Iterator last, 
297                                               unsigned int N, 
298                                               utility::unweighted_iterator_tag)
299  {
300    utility::Vector vec(std::distance(first, last));
301    std::copy(first, last, vec.begin());
302    std::sort(vec.begin(), vec.end());
303    init(vec, N);
304  }
305
306
307  template<typename Iterator>
308  void qQuantileNormalizer::Partitioner::build(Iterator first, 
309                                               Iterator last, unsigned int N, 
310                                               utility::weighted_iterator_tag)
311  {
312    std::vector<utility::DataWeight> vec;
313    vec.reserve(std::distance(first, last));
314    std::back_insert_iterator<std::vector<utility::DataWeight> > inserter(vec);
315    std::copy(first, last, inserter);
316    std::sort(vec.begin(), vec.end());
317    init(vec, N);
318  }
319
320
321}}} // end of namespace normalizer, yat and thep
322
323#endif
Note: See TracBrowser for help on using the repository browser.