source: trunk/yat/normalizer/qQuantileNormalizer.h @ 1803

Last change on this file since 1803 was 1803, checked in by Peter, 14 years ago

fixed typo - refs #478

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 11.5 KB
Line 
1#ifndef _theplu_yat_normalizer_qquantile_normalizer_
2#define _theplu_yat_normalizer_qquantile_normalizer_
3
4/*
5  Copyright (C) 2009 Jari Häkkinen, Peter Johansson
6
7  This file is part of the yat library, http://dev.thep.lu.se/yat
8
9  The yat library is free software; you can redistribute it and/or
10  modify it under the terms of the GNU General Public License as
11  published by the Free Software Foundation; either version 3 of the
12  License, or (at your option) any later version.
13
14  The yat library is distributed in the hope that it will be useful,
15  but WITHOUT ANY WARRANTY; without even the implied warranty of
16  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  General Public License for more details.
18
19  You should have received a copy of the GNU General Public License
20  along with yat. If not, see <http://www.gnu.org/licenses/>.
21*/
22
23#include "yat/regression/CSplineInterpolation.h"
24#include "yat/utility/DataIterator.h"
25#include "yat/utility/DataWeight.h"
26#include "yat/utility/iterator_traits.h"
27#include "yat/utility/sort_index.h"
28#include "yat/utility/Vector.h"
29#include "yat/utility/WeightIterator.h"
30#include "yat/utility/yat_assert.h"
31
32#include <algorithm>
33#include <cmath>
34#include <iterator>
35#include <numeric>
36#include <stdexcept>
37#include <vector>
38
39namespace theplu {
40namespace yat {
41namespace utility {
42  class VectorBase;
43}
44namespace normalizer {
45
46  /**
47     \brief Perform Q-quantile normalization
48
49     After a Q-quantile normalization each column has approximately
50     the same distribution of data (the Q-quantiles are the
51     same). Also, within each column the rank of an element is not
52     changed.
53
54     The normalization goes like this
55     - Data is not assumed to be sorted.
     - Partition sorted target data in N parts. N must be 3 or larger
       because of requirements from the underlying cspline fit
58     - Calculate the arithmetic mean for each part, the mean is
59       assigned to the mid point of each part.
     - Do the above for the data to be transformed (called source
       here).
62     - For each part, calculate the difference between the target and
63       the source. Now we have N differences d_i with associated rank
64       (midpoint of each part).
65     - Create a cubic spline fit to this difference vector d. The
66       resulting curve is used to recalculate all column values.
67       - Use the cubic spline fit for values within the cubic spline
68         fit range [midpoint 1st part, midpoint last part].
69       - For data outside the cubic spline fit use linear
70         extrapolation, i.e., a constant shift. d_first for points
71         below fit range, and d_last for points above fit range.
72
73     \since New in yat 0.5
74   */
  class qQuantileNormalizer
  {
  public:
    /**
       \brief Create a normalizer using the range [\a first, \a last)
       as the target distribution.

       \a Q is the number of parts and must be within \f$ [3,N] \f$
       where \f$ N \f$ is the total number of data points in the
       target. However, if \f$ N \f$ is larger than the number of points
       in the data to be normalized the behaviour of the code is
       undefined. Keep \f$ N \f$ equal to or less than the smallest
       number of data points in the target or each data set to be
       normalized against a given target. The lower bound of three is
       due to restrictions in the cspline fit utilized in the
       normalization.
    */
    template<typename ForwardIterator>
    qQuantileNormalizer(ForwardIterator first, ForwardIterator last,
                        unsigned int Q);

    /**
       \brief perform the Q-quantile normalization.

       It is possible to normalize "in place"; it is permissible for
       \a first and \a result to be the same.
     */
    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
    void operator()(RandomAccessIterator1 first, RandomAccessIterator1 last,
                    RandomAccessIterator2 result) const;

  private:

  /**
     \brief Partition a range of data into equal sizes.

     The class also calculates the average of each part and assigns
     the average to the mid point of each part. The midpoint is a
     double, i.e., it is not forced to be an integer index.
  */
  class Partitioner
  {
  public:
    /**
       \brief Create the partition and perform required calculations.
    */
    template<typename ForwardIterator>
    Partitioner(ForwardIterator first, ForwardIterator last, 
                unsigned int N);

    /**
       \brief Return the averages for each part.

       \return The average vector.
    */
    const utility::Vector& averages(void) const;

    /**
       \brief Return the mid point for each partition.

       \return The index vector.
    */
    const utility::Vector& index(void) const;

    /**
       \return The number of parts.
    */
    size_t size(void) const;

  private:
    // unweighted "constructor"
    template<typename ForwardIterator>
    void build(ForwardIterator first, ForwardIterator last, unsigned int N, 
               utility::unweighted_iterator_tag);
    // weighted "constructor"
    template<typename ForwardIterator>
    void build(ForwardIterator first, ForwardIterator last, unsigned int N, 
               utility::weighted_iterator_tag);
    void init(const utility::VectorBase&, unsigned int N);
    void init(const std::vector<utility::DataWeight>&, unsigned int N);

    utility::Vector average_;
    utility::Vector index_;
  };

    // Partitioning of the target distribution; its size() is Q and it
    // supplies the per-part averages and midpoints every call to
    // operator() is normalized against.
    Partitioner target_;

    // unweighted version
    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
    void normalize(const Partitioner& source,RandomAccessIterator1 first, 
                   RandomAccessIterator1 last, RandomAccessIterator2 result,
                   utility::unweighted_iterator_tag tag) const;

    // weighted version
    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
    void normalize(const Partitioner& source,RandomAccessIterator1 first, 
                   RandomAccessIterator1 last, RandomAccessIterator2 result,
                   utility::weighted_iterator_tag tag) const;
  };
173
174
175  // template implementations
176
177  template<typename ForwardIterator>
178  qQuantileNormalizer::qQuantileNormalizer(ForwardIterator first, 
179                                           ForwardIterator last,
180                                           unsigned int Q)
181    : target_(Partitioner(first, last, Q))
182  {
183    utility::yat_assert<std::runtime_error>(Q>2,
184                                            "qQuantileNormalizer: Q too small");
185  }
186
187
188  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
189  void qQuantileNormalizer::operator()(RandomAccessIterator1 first, 
190                                       RandomAccessIterator1 last,
191                                       RandomAccessIterator2 result) const
192  {
193    Partitioner source(first, last, target_.size());
194    typename utility::weighted_iterator_traits<RandomAccessIterator2>::type tag;
195    normalize(source, first, last, result, tag);
196  }
197
198
  // Unweighted normalization: shift every element by the difference
  // between the source and target part averages, interpolated with a
  // cubic spline over the part midpoints and extrapolated with a
  // constant shift outside the spline's range. Elements are processed
  // in rank order via sorted_index so each element's quantile
  // position (i+0.5)/N selects its correction.
  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
  void 
  qQuantileNormalizer::normalize(const qQuantileNormalizer::Partitioner& source,
                                 RandomAccessIterator1 first, 
                                 RandomAccessIterator1 last, 
                                 RandomAccessIterator2 result,
                                 utility::unweighted_iterator_tag tag) const
  {
    utility::check_iterator_is_unweighted(first);
    utility::check_iterator_is_unweighted(result);
    size_t N = last-first;
    // need at least as many data points as parts
    utility::yat_assert<std::runtime_error>
      (N >= target_.size(), "qQuantileNormalizer: Input range too small");

    // ranks of the input: sorted_index[i] is the position in [first,
    // last) of the element with rank i
    std::vector<size_t> sorted_index(last-first);
    utility::sort_index(first, last, sorted_index);

    // per-part correction: source average minus target average,
    // anchored at the target part midpoints (idx, fractions in [0,1])
    utility::Vector diff(source.averages());
    diff-=target_.averages();
    const utility::Vector& idx=target_.index();
    regression::CSplineInterpolation cspline(idx,diff);

    // linear extrapolation for first part, i.e., use first diff for
    // all points in the first part.
    size_t start=0;
    // first element index at/after the first midpoint (idx(0) scaled
    // from quantile fraction to element index)
    size_t end = static_cast<size_t>(std::ceil(N*idx(0) - 0.5));
    // take care of limiting case number of parts approximately equal
    // to the number of elements in range.
    if (end==0)
      ++end;
    for (size_t i=start; i<end; ++i) {
      size_t si = sorted_index[i];
      result[si] = first[si] - diff(0);
    }

    using utility::yat_assert;
   
    // cspline interpolation for all data between the mid points of
    // the first and last part
    start=end;
    end = static_cast<size_t>(std::ceil(N*idx(idx.size()-1) - 0.5));
    if (end>=N)
      end = N-1;
    for ( size_t i=start; i<end; ++i) {
      size_t si = sorted_index[i];
     
      // quantile position must lie inside the spline's fitted range
      yat_assert<std::runtime_error>((i+0.5)/N>idx(0), 
                               "qQuantileNormalizer: invalid input to cspline");
      result[si] = first[si] - cspline.evaluate((i+0.5)/N);
    }
   
    // linear extrapolation for last part, i.e., use last diff for
    // all points in the last part.
    for (size_t i=end ; i<N; ++i) {
      size_t si = sorted_index[i];
      result[si] = first[si] - diff(diff.size()-1);
    }
  }
257
258
  // Weighted normalization: same scheme as the unweighted version but
  // an element's quantile position is its cumulative weight (plus half
  // its own weight) divided by the total weight, rather than its rank.
  // Weights themselves are copied through unchanged.
  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
  void qQuantileNormalizer::normalize(const Partitioner& source,
                                      RandomAccessIterator1 first, 
                                      RandomAccessIterator1 last, 
                                      RandomAccessIterator2 result,
                                      utility::weighted_iterator_tag tag) const
  {
    // copy the weights
    std::copy(utility::weight_iterator(first), 
              utility::weight_iterator(last), 
              utility::weight_iterator(result)); 

    double total_w = std::accumulate(utility::weight_iterator(first),
                                     utility::weight_iterator(last), 0.0);

    // rank elements by their data value (weights ignored for ranking)
    std::vector<size_t> sorted_index(last-first);
    utility::sort_index(utility::data_iterator(first), 
                        utility::data_iterator(last), sorted_index);

    // per-part correction: source average minus target average,
    // anchored at the target part midpoints
    utility::Vector diff(source.averages());
    diff-=target_.averages();
    const utility::Vector& idx=target_.index();
    regression::CSplineInterpolation cspline(idx,diff);

    double sum_w=0;  // cumulative weight of elements already processed
    utility::iterator_traits<RandomAccessIterator1> traits1;
    utility::iterator_traits<RandomAccessIterator2> traits2;
    for (size_t i=0; i<sorted_index.size(); ++i) {
      size_t si = sorted_index[i];
      // weighted quantile position of element si, in [0,1]
      double w = (sum_w + 0.5*traits1.weight(first+si))/total_w;
      double correction = 0;
      if (w <= idx(0)) {
        // linear extrapolation for first part, i.e., use first diff for
        // all points in the first part.
        correction = diff(0);
      }
      else if (w < idx(idx.size()-1) ) {
        // cspline interpolation for all data between the mid points of
        // the first and last part
        correction = cspline.evaluate(w);
      }
      else {
        // linear extrapolation for last part, i.e., use last diff for
        // all points in the last part.
        correction = diff(diff.size()-1);
      }
      traits2.data(result+si) = traits1.data(first+si) - correction;
      sum_w += traits1.weight(first+si);
    }
  }
309
310
311  template<typename ForwardIterator>
312  qQuantileNormalizer::Partitioner::Partitioner(ForwardIterator first, 
313                                                ForwardIterator last, 
314                                                unsigned int N)
315    : average_(utility::Vector(N)), index_(utility::Vector(N))
316  {
317    build(first, last, N, 
318          typename utility::weighted_iterator_traits<ForwardIterator>::type());
319  }
320
321
322  template<typename ForwardIterator>
323  void qQuantileNormalizer::Partitioner::build(ForwardIterator first, 
324                                               ForwardIterator last, 
325                                               unsigned int N, 
326                                               utility::unweighted_iterator_tag)
327  {
328    utility::Vector vec(std::distance(first, last));
329    std::copy(first, last, vec.begin());
330    std::sort(vec.begin(), vec.end());
331    init(vec, N);
332  }
333
334
335  template<typename ForwardIterator>
336  void qQuantileNormalizer::Partitioner::build(ForwardIterator first, 
337                                               ForwardIterator last, 
338                                               unsigned int N, 
339                                               utility::weighted_iterator_tag)
340  {
341    std::vector<utility::DataWeight> vec;
342    vec.reserve(std::distance(first, last));
343    std::back_insert_iterator<std::vector<utility::DataWeight> > inserter(vec);
344    std::copy(first, last, inserter);
345    std::sort(vec.begin(), vec.end());
346    init(vec, N);
347  }
348
349
350}}} // end of namespace normalizer, yat and thep
351
352#endif
Note: See TracBrowser for help on using the repository browser.