source: trunk/yat/normalizer/qQuantileNormalizer.h @ 1771

Last change on this file since 1771 was 1771, checked in by Peter, 14 years ago

rescaling index - refs #478

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 10.6 KB
#ifndef _theplu_yat_normalizer_qquantile_normalizer_
#define _theplu_yat_normalizer_qquantile_normalizer_

/*
  Copyright (C) 2009 Jari Häkkinen, Peter Johansson

  This file is part of the yat library, http://dev.thep.lu.se/yat

  The yat library is free software; you can redistribute it and/or
  modify it under the terms of the GNU General Public License as
  published by the Free Software Foundation; either version 3 of the
  License, or (at your option) any later version.

  The yat library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with yat. If not, see <http://www.gnu.org/licenses/>.
*/

#include "yat/regression/CSplineInterpolation.h"
#include "yat/utility/DataIterator.h"
#include "yat/utility/DataWeight.h"
#include "yat/utility/iterator_traits.h"
#include "yat/utility/Vector.h"
#include "yat/utility/WeightIterator.h"
#include "yat/utility/yat_assert.h"

#include <algorithm>
#include <cmath>
#include <iterator>
#include <stdexcept>
#include <vector>

namespace theplu {
namespace yat {
namespace utility {
  class VectorBase;
}
namespace normalizer {

  /**
     \brief Perform Q-quantile normalization

     After a Q-quantile normalization each column has approximately
     the same distribution of data (the Q-quantiles are the
     same). Also, within each column the rank of an element is not
     changed.

     The normalization works as follows:
     - Data is not assumed to be sorted.
     - Partition the sorted target data into N parts. N must be at
       least 3 because of requirements from the underlying cspline
       fit.
     - Calculate the arithmetic mean of each part; the mean is
       assigned to the midpoint of each part.
     - Do the above for the data to be transformed (called source
       here).
     - For each part, calculate the difference between the target and
       the source. Now we have N differences d_i with associated
       ranks (the midpoint of each part).
     - Create a cubic spline fit to this difference vector d. The
       resulting curve is used to recalculate all column values.
       - Use the cubic spline fit for values within the cubic spline
         fit range [midpoint 1st part, midpoint last part].
       - For data outside the cubic spline fit range, extrapolate
         with a constant shift: d_first for points below the fit
         range and d_last for points above it.

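     A minimal usage sketch (illustrative only; the container sizes
     and the choice Q = 10 are arbitrary and not prescribed by the
     class):

     \code
     std::vector<double> target(100);
     std::vector<double> data(100);
     // ... fill target and data with values ...
     qQuantileNormalizer normalizer(target.begin(), target.end(), 10);
     std::vector<double> result(data.size());
     normalizer(data.begin(), data.end(), result.begin());
     \endcode
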
     \since New in yat 0.5
   */
  class qQuantileNormalizer
  {
  public:
    /**
       \brief Create a normalizer with the range [\a first, \a last)
       as target distribution.

       \a Q is the number of parts and must be within \f$ [3,N] \f$
       where \f$ N \f$ is the total number of data points in the
       target. However, if \f$ Q \f$ is larger than the number of
       points in the data to be normalized the behaviour of the code
       is undefined. Keep \f$ Q \f$ equal to or less than the smallest
       number of data points in the target or in any data set to be
       normalized against a given target. The lower bound of three is
       due to restrictions in the cspline fit utilized in the
       normalization.
    */
    template<typename BidirectionalIterator>
    qQuantileNormalizer(BidirectionalIterator first, BidirectionalIterator last,
                        unsigned int Q);

    /**
       \brief Perform the Q-quantile normalization.

       It is possible to normalize "in place"; it is permissible for
       \a first and \a result to be the same.
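
       As an illustrative sketch (assuming \c data is a
       \c std::vector<double> and \c normalizer a qQuantileNormalizer
       object), an in-place call would look like:
       \code
       normalizer(data.begin(), data.end(), data.begin());
       \endcode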
     */
    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
    void operator()(RandomAccessIterator1 first, RandomAccessIterator1 last,
                    RandomAccessIterator2 result) const;

  private:

  /**
     \brief Partition a range of data into parts of equal size.

     The class also calculates the average of each part and assigns
     the average to the midpoint of each part. The midpoint is a
     double, i.e., it is not forced to be an integer index.
  */
  class Partitioner
  {
  public:
    /**
       \brief Create the partition and perform required calculations.
    */
    template<typename BidirectionalIterator>
    Partitioner(BidirectionalIterator first, BidirectionalIterator last, 
                unsigned int N);

    /**
       \brief Return the averages for each part.

       \return The average vector.
    */
    const utility::Vector& averages(void) const;

    /**
       \brief Return the midpoint of each part.

       \return The index vector.
    */
    const utility::Vector& index(void) const;

    /**
       \return The number of parts.
    */
    size_t size(void) const;

  private:
    // unweighted "constructor"
    template<typename Iterator>
    void build(Iterator first, Iterator last, unsigned int N, 
               utility::unweighted_iterator_tag);
    // weighted "constructor"
    template<typename Iterator>
    void build(Iterator first, Iterator last, unsigned int N, 
               utility::weighted_iterator_tag);
    void init(const utility::VectorBase&, unsigned int N);
    void init(const std::vector<utility::DataWeight>&, unsigned int N);

    utility::Vector average_;
    utility::Vector index_;
  };


    size_t N_target_;
    Partitioner target_;

    // unweighted version
    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
    void normalize(const Partitioner& source, RandomAccessIterator1 first, 
                   RandomAccessIterator1 last, RandomAccessIterator2 result,
                   utility::unweighted_iterator_tag tag) const;

    // weighted version
    // copy weights and apply unweighted version on data part
    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
    void normalize(const Partitioner& source, RandomAccessIterator1 first, 
                   RandomAccessIterator1 last, RandomAccessIterator2 result,
                   utility::weighted_iterator_tag tag) const;
  };


  // template implementations

  template<typename BidirectionalIterator>
  qQuantileNormalizer::qQuantileNormalizer(BidirectionalIterator first, 
                                           BidirectionalIterator last,
                                           unsigned int Q)
    : N_target_(std::distance(first, last)), 
      target_(Partitioner(first, last, Q))
  {
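    // Q must be at least 3; see the constraints documented for the
    // constructor above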
    utility::yat_assert<std::runtime_error>(Q>2, 
                                            "qQuantileNormalizer: Q too small");
  }


  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
  void qQuantileNormalizer::operator()(RandomAccessIterator1 first, 
                                       RandomAccessIterator1 last,
                                       RandomAccessIterator2 result) const
  {
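    // Partition the input range into the same number of parts as the
    // target and dispatch on whether the result iterator is weighted
    // or unweighted.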
    Partitioner source(first, last, target_.size());
    typename utility::weighted_iterator_traits<RandomAccessIterator2>::type tag;
    normalize(source, first, last, result, tag);
  }


  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
  void 
  qQuantileNormalizer::normalize(const qQuantileNormalizer::Partitioner& source,
                                 RandomAccessIterator1 first, 
                                 RandomAccessIterator1 last, 
                                 RandomAccessIterator2 result,
                                 utility::unweighted_iterator_tag tag) const
  {
    utility::check_iterator_is_unweighted(first);
    utility::check_iterator_is_unweighted(result);
    size_t N = last-first;
    utility::yat_assert<std::runtime_error>
      (N >= target_.size(), "qQuantileNormalizer: Input range too small");

    std::vector<size_t> sorted_index(last-first);
    utility::sort_index(first, last, sorted_index);

    utility::Vector diff(source.averages());
    diff-=target_.averages();
    utility::Vector idx=target_.index();
    // if the number of source data points N differs from N_target_,
    // rescale idx so that both index ranges stretch over the same range
    idx *= static_cast<double>(N)/static_cast<double>(N_target_);
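    // (illustrative numbers: with N_target_ == 200 and N == 100, a
    // target midpoint at index 150.0 is rescaled to 75.0 in the
    // source index space)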

    regression::CSplineInterpolation cspline(idx, diff);

    // extrapolation with a constant shift for the first part, i.e.,
    // use the first diff for all points below the first midpoint.
    size_t start=0;
    size_t end=static_cast<unsigned int>(std::ceil(idx(0)));
    // take care of the limiting case where the number of parts is
    // approximately equal to the number of elements in the range.
    if (end==0)
      ++end;
    for (size_t row=start; row<end; ++row) {
      size_t srow=sorted_index[row];
      result[srow] = first[srow] - diff(0);
    }

    // cspline interpolation for all data between the midpoints of
    // the first and last part
    start=end;
    end=static_cast<unsigned int>(idx(target_.size()-1));
    // take care of the limiting case where the number of parts is
    // approximately equal to the number of elements in the range.
    if (end==(static_cast<size_t>(N-1)) )
      --end;
    for (size_t row=start; row<=end; ++row) {
      size_t srow=sorted_index[row];
      result[srow] = first[srow] - cspline.evaluate(row);
    }

    // extrapolation with a constant shift for the last part, i.e.,
    // use the last diff for all points above the last midpoint.
    start=end+1;
    end=N;
    for (size_t row=start; row<end; ++row) {
      size_t srow=sorted_index[row];
      result[srow] = first[srow] - diff(diff.size()-1);
    }
  }


  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
  void qQuantileNormalizer::normalize(const Partitioner& source,
                                      RandomAccessIterator1 first, 
                                      RandomAccessIterator1 last, 
                                      RandomAccessIterator2 result,
                                      utility::weighted_iterator_tag tag) const
  {
    // copy the weights
    std::copy(utility::weight_iterator(first), 
              utility::weight_iterator(last), 
              utility::weight_iterator(result)); 
    // apply algorithm on data part of range
    normalize(source, utility::data_iterator(first),
              utility::data_iterator(last),
              utility::data_iterator(result),
              utility::unweighted_iterator_tag());
  }


  template<typename BidirectionalIterator>
  qQuantileNormalizer::Partitioner::Partitioner(BidirectionalIterator first, 
                                                BidirectionalIterator last, 
                                                unsigned int N)
    : average_(utility::Vector(N)), index_(utility::Vector(N))
  {
    typedef typename 
      utility::weighted_iterator_traits<BidirectionalIterator>::type tag;
    build(first, last, N, tag());
  }


  template<typename Iterator>
  void qQuantileNormalizer::Partitioner::build(Iterator first, Iterator last, 
                                               unsigned int N, 
                                               utility::unweighted_iterator_tag)
  {
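    // copy the data into a Vector, sort the copy, and let init() do
    // the actual partitioning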
    utility::Vector vec(std::distance(first, last));
    std::copy(first, last, vec.begin());
    std::sort(vec.begin(), vec.end());
    init(vec, N);
  }


  template<typename Iterator>
  void qQuantileNormalizer::Partitioner::build(Iterator first, 
                                               Iterator last, unsigned int N, 
                                               utility::weighted_iterator_tag)
  {
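    // copy the (data, weight) pairs, sort them, and let init() do the
    // actual partitioning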
    std::vector<utility::DataWeight> vec;
    vec.reserve(std::distance(first, last));
    std::back_insert_iterator<std::vector<utility::DataWeight> > inserter(vec);
    std::copy(first, last, inserter);
    std::sort(vec.begin(), vec.end());
    init(vec, N);
  }


}}} // end of namespace normalizer, yat and theplu

#endif