source: trunk/yat/normalizer/qQuantileNormalizer.h @ 1768

Last change on this file since 1768 was 1768, checked in by Jari Häkkinen, 12 years ago

The ends are extrapolated not interpolated.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 10.4 KB
#ifndef _theplu_yat_normalizer_qquantile_normalizer_
#define _theplu_yat_normalizer_qquantile_normalizer_

/*
  Copyright (C) 2009 Jari Häkkinen, Peter Johansson

  This file is part of the yat library, http://dev.thep.lu.se/yat

  The yat library is free software; you can redistribute it and/or
  modify it under the terms of the GNU General Public License as
  published by the Free Software Foundation; either version 3 of the
  License, or (at your option) any later version.

  The yat library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with yat. If not, see <http://www.gnu.org/licenses/>.
*/

#include "yat/regression/CSplineInterpolation.h"
#include "yat/utility/DataIterator.h"
#include "yat/utility/DataWeight.h"
#include "yat/utility/iterator_traits.h"
#include "yat/utility/Vector.h"
#include "yat/utility/WeightIterator.h"
#include "yat/utility/yat_assert.h"

#include <algorithm>
#include <cmath>
#include <iterator>
#include <stdexcept>
#include <vector>

namespace theplu {
namespace yat {
namespace utility {
  class VectorBase;
}
namespace normalizer {

  /**
     \brief Perform Q-quantile normalization

     After a Q-quantile normalization each column has approximately
     the same distribution of data (the Q-quantiles are the
     same). Also, within each column the rank of an element is not
     changed.

     The normalization proceeds as follows:
     - Data is not assumed to be sorted.
     - Partition the sorted target data into N parts. N must be 3 or
       larger because of requirements from the underlying cspline fit.
     - Calculate the arithmetic mean for each part; the mean is
       assigned to the mid point of each part.
     - Do the above for the data to be transformed (called source
       here).
     - For each part, calculate the difference between the target and
       the source. Now we have N differences d_i with associated rank
       (midpoint of each part).
     - Create a cubic spline fit to this difference vector d. The
       resulting curve is used to recalculate all column values.
       - Use the cubic spline fit for values within the cubic spline
         fit range [midpoint 1st part, midpoint last part].
       - For data outside the cubic spline fit range use linear
         extrapolation, i.e., a constant shift: d_first for points
         below the fit range, and d_last for points above the fit range.

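     A minimal usage sketch (illustrative only; the containers, their
     sizes, and the choice of Q = 100 are hypothetical and not part of
     this interface):
     \code
     using theplu::yat::normalizer::qQuantileNormalizer;
     std::vector<double> target(1000);          // target distribution
     std::vector<double> data(1000);            // data to be normalized
     // ... fill target and data ...
     std::vector<double> result(data.size());
     qQuantileNormalizer qqn(target.begin(), target.end(), 100);
     qqn(data.begin(), data.end(), result.begin());
     \endcode
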
     \since New in yat 0.5
   */
  class qQuantileNormalizer
  {
  public:
    /**
       \brief Construct a normalizer with the target distribution
       given by the range [\a first, \a last) divided into \a Q parts.

       \a Q is the number of parts and must be within \f$ [3,N] \f$
       where \f$ N \f$ is the total number of data points in the
       target. However, if \a Q is larger than the number of points
       in the data to be normalized the behaviour of the code is
       undefined. Keep \a Q equal to or less than the smallest
       number of data points in the target or in any data set to be
       normalized against a given target. The lower bound of three is
       due to restrictions in the cspline fit utilized in the
       normalization.
    */
    template<typename BidirectionalIterator>
    qQuantileNormalizer(BidirectionalIterator first, BidirectionalIterator last,
                        unsigned int Q);

    /**
       \brief Perform the Q-quantile normalization.

       It is possible to normalize "in place"; it is permissible for
       \a first and \a result to be the same.
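
       A minimal in-place sketch (illustrative; \c qqn is a previously
       constructed qQuantileNormalizer and \c v a std::vector<double>,
       both hypothetical names):
       \code
       qqn(v.begin(), v.end(), v.begin());
       \endcode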
     */
    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
    void operator()(RandomAccessIterator1 first, RandomAccessIterator1 last,
                    RandomAccessIterator2 result) const;

  private:

  /**
     \brief Partition a range of data into parts of equal size.

     The class also calculates the average of each part and assigns
     the average to the mid point of each part. The midpoint is a
     double, i.e., it is not forced to be an integer index.
  */
  class Partitioner
  {
  public:
    /**
       \brief Create the partition and perform required calculations.
    */
    template<typename BidirectionalIterator>
    Partitioner(BidirectionalIterator first, BidirectionalIterator last,
                unsigned int N);

    /**
       \brief Return the averages for each part.

       \return The average vector.
    */
    const utility::Vector& averages(void) const;

    /**
       \brief Return the mid point for each partition.

       \return The index vector.
    */
    const utility::Vector& index(void) const;

    /**
       \return The number of parts.
    */
    size_t size(void) const;

  private:
    // unweighted "constructor"
    template<typename Iterator>
    void build(Iterator first, Iterator last, unsigned int N,
               utility::unweighted_iterator_tag);
    // weighted "constructor"
    template<typename Iterator>
    void build(Iterator first, Iterator last, unsigned int N,
               utility::weighted_iterator_tag);
    void init(const utility::VectorBase&, unsigned int N);
    void init(const std::vector<utility::DataWeight>&, unsigned int N);

    utility::Vector average_;
    utility::Vector index_;
  };


    Partitioner target_;

    // unweighted version
    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
    void normalize(const Partitioner& source, RandomAccessIterator1 first,
                   RandomAccessIterator1 last, RandomAccessIterator2 result,
                   utility::unweighted_iterator_tag tag) const;

    // weighted version
    // copy weights and apply unweighted version on data part
    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
    void normalize(const Partitioner& source, RandomAccessIterator1 first,
                   RandomAccessIterator1 last, RandomAccessIterator2 result,
                   utility::weighted_iterator_tag tag) const;
  };


  // template implementations

  template<typename BidirectionalIterator>
  qQuantileNormalizer::qQuantileNormalizer(BidirectionalIterator first,
                                           BidirectionalIterator last,
                                           unsigned int Q)
    : target_(Partitioner(first, last, Q))
  {
    utility::yat_assert<std::runtime_error>(Q>2,
                                            "qQuantileNormalizer: Q too small");
  }


  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
  void qQuantileNormalizer::operator()(RandomAccessIterator1 first,
                                       RandomAccessIterator1 last,
                                       RandomAccessIterator2 result) const
  {
    Partitioner source(first, last, target_.size());
    typename utility::weighted_iterator_traits<RandomAccessIterator2>::type tag;
    normalize(source, first, last, result, tag);
  }


  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
  void
  qQuantileNormalizer::normalize(const qQuantileNormalizer::Partitioner& source,
                                 RandomAccessIterator1 first,
                                 RandomAccessIterator1 last,
                                 RandomAccessIterator2 result,
                                 utility::unweighted_iterator_tag tag) const
  {
    utility::check_iterator_is_unweighted(first);
    utility::check_iterator_is_unweighted(result);
    size_t N = last-first;
    utility::yat_assert<std::runtime_error>
      (N >= target_.size(), "qQuantileNormalizer: Input range too small");

    std::vector<size_t> sorted_index(last-first);
    utility::sort_index(first, last, sorted_index);
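    // sorted_index[i] is the position in [first, last) of the i:th
    // smallest element, i.e. first[sorted_index[i]] visits the data
    // in increasing order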

    utility::Vector diff(source.averages());
    diff-=target_.averages();
    const utility::Vector& idx=target_.index();
    regression::CSplineInterpolation cspline(idx, diff);

    // linear extrapolation for first part, i.e., use first diff for
    // all points in the first part.
    size_t start=0;
    size_t end=static_cast<unsigned int>(std::ceil(idx(0)));
    // take care of the limiting case where the number of parts is
    // approximately equal to the number of elements in the range.
    if (end==0)
      ++end;
    for (size_t row=start; row<end; ++row) {
      size_t srow=sorted_index[row];
      result[srow] = first[srow] - diff(0);
    }

    // cspline interpolation for all data between the mid points of
    // the first and last part
    start=end;
    end=static_cast<unsigned int>(idx(target_.size()-1));
    // take care of the limiting case where the number of parts is
    // approximately equal to the number of elements in the range.
    if (end==(static_cast<size_t>(N-1)))
      --end;
    for (size_t row=start; row<=end; ++row) {
      size_t srow=sorted_index[row];
      result[srow] = first[srow] - cspline.evaluate(row);
    }

    // linear extrapolation for last part, i.e., use last diff for
    // all points in the last part.
    start=end+1;
    end=N;
    for (size_t row=start; row<end; ++row) {
      size_t srow=sorted_index[row];
      result[srow] = first[srow] - diff(diff.size()-1);
    }
  }


  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
  void qQuantileNormalizer::normalize(const Partitioner& source,
                                      RandomAccessIterator1 first,
                                      RandomAccessIterator1 last,
                                      RandomAccessIterator2 result,
                                      utility::weighted_iterator_tag tag) const
  {
    // copy the weights
    std::copy(utility::weight_iterator(first),
              utility::weight_iterator(last),
              utility::weight_iterator(result));
    // apply algorithm on data part of range
    normalize(source, utility::data_iterator(first),
              utility::data_iterator(last),
              utility::data_iterator(result),
              utility::unweighted_iterator_tag());
  }


  template<typename BidirectionalIterator>
  qQuantileNormalizer::Partitioner::Partitioner(BidirectionalIterator first,
                                                BidirectionalIterator last,
                                                unsigned int N)
    : average_(utility::Vector(N)), index_(utility::Vector(N))
  {
    typedef typename
      utility::weighted_iterator_traits<BidirectionalIterator>::type tag;
    build(first, last, N, tag());
  }


  template<typename Iterator>
  void qQuantileNormalizer::Partitioner::build(Iterator first, Iterator last,
                                               unsigned int N,
                                               utility::unweighted_iterator_tag)
  {
    utility::Vector vec(std::distance(first, last));
    std::copy(first, last, vec.begin());
    std::sort(vec.begin(), vec.end());
    init(vec, N);
  }


  template<typename Iterator>
  void qQuantileNormalizer::Partitioner::build(Iterator first,
                                               Iterator last, unsigned int N,
                                               utility::weighted_iterator_tag)
  {
    std::vector<utility::DataWeight> vec;
    vec.reserve(std::distance(first, last));
    std::back_insert_iterator<std::vector<utility::DataWeight> > inserter(vec);
    std::copy(first, last, inserter);
    std::sort(vec.begin(), vec.end());
    init(vec, N);
  }


}}} // end of namespace normalizer, yat and theplu

#endif