source: trunk/yat/normalizer/qQuantileNormalizer.h @ 1813

Last change on this file since 1813 was 1813, checked in by Peter, 13 years ago

Changing name of function Partition::index to Partition::quantiles, which is more descriptive after recent changes.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 13.0 KB
#ifndef _theplu_yat_normalizer_qquantile_normalizer_
#define _theplu_yat_normalizer_qquantile_normalizer_

/*
  Copyright (C) 2009 Jari Häkkinen, Peter Johansson

  This file is part of the yat library, http://dev.thep.lu.se/yat

  The yat library is free software; you can redistribute it and/or
  modify it under the terms of the GNU General Public License as
  published by the Free Software Foundation; either version 3 of the
  License, or (at your option) any later version.

  The yat library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with yat. If not, see <http://www.gnu.org/licenses/>.
*/

#include "yat/regression/CSplineInterpolation.h"
#include "yat/utility/DataIterator.h"
#include "yat/utility/DataWeight.h"
#include "yat/utility/iterator_traits.h"
#include "yat/utility/sort_index.h"
#include "yat/utility/Vector.h"
#include "yat/utility/WeightIterator.h"
#include "yat/utility/yat_assert.h"

#include <algorithm>
#include <cmath>
#include <iterator>
#include <numeric>
#include <stdexcept>
#include <vector>

namespace theplu {
namespace yat {
namespace utility {
  class VectorBase;
}
namespace normalizer {

  /**
     \brief Perform Q-quantile normalization

     Perform a Q-quantile normalization on a \a source range, after
     which it will have approximately the same distribution of data as
     the \a target range (the Q-quantiles are the same). The rank of
     an element in the \a source range is not changed.

     The class also works with unweighted ranges, and a weighted \a
     source range does not require a weighted \a target range (or
     vice versa).

     Normalization proceeds as follows:
     - Data are not assumed to be sorted.
     - Partition the sorted \a target data into N parts. N must be 3
       or larger because of requirements from the underlying cspline
       fit.
     - Calculate the arithmetic (weighted) mean for each part; the
       mean is assigned to the mid point of each part.
     - Do the above for the data to be transformed (called \a source
       here).
     - For each part, calculate the difference between the \a source
       and the \a target means. Now we have N differences
       \f$ d_i \f$, each with an associated rank (the mid point of its
       part).
     - Create a cubic spline fit to this difference vector \a d. The
       resulting curve is used to recalculate all values in \a source.
       - Use the cubic spline fit for values within the cubic spline
         fit range [mid point of 1st part, mid point of last part].
       - For data outside the cubic spline fit range use linear
         extrapolation, i.e., a constant shift: \f$ d_{first} \f$ for
         points below the fit range, and \f$ d_{last} \f$ for points
         above the fit range.

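     A minimal usage sketch (the container names, sizes, and the
     choice Q = 100 below are illustrative only and not dictated by
     the library): construct the normalizer from a target range and
     then apply it to a source range.

     \code
     std::vector<double> target(1000);
     std::vector<double> source(800);
     // ... fill target and source with data ...
     std::vector<double> result(source.size());
     theplu::yat::normalizer::qQuantileNormalizer qqn(target.begin(),
                                                      target.end(), 100);
     qqn(source.begin(), source.end(), result.begin());
     \endcode
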
     \since New in yat 0.5
   */
  class qQuantileNormalizer
  {
  public:
    /**
       \brief Constructor

       Divides a sorted copy of the range [\a first, \a last) into
       \a Q parts. Parts are divided such that the sum of weights is
       approximately the same in the different parts. If a relative
       weight, \f$ w_i / \sum w_i \f$, is larger than 1/Q this might
       be difficult to achieve, in which case an exception is
       thrown. In the unweighted case this implies that \a Q should be
       smaller than (or equal to) the number of elements in
       [\a first, \a last).

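       As an illustration (the numbers are hypothetical, not part of
       the interface): with weights \f$ \{5,1,1,1,1,1\} \f$ the largest
       relative weight is \f$ 5/10 = 0.5 \f$, which is larger than 1/Q
       for every allowed \f$ Q \ge 3 \f$, so construction may throw;
       with ten unweighted elements any \f$ Q \f$ from 3 to 10 is fine.
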
       As the \a source range is also divided into \a Q parts (in
       operator()), it is recommended to keep \a Q smaller than (or
       equal to) the size of the ranges that will be normalized.

       Also, \a Q must not be smaller than 3 due to restrictions in
       the cspline fit utilized in the normalization.

       Requirements:
       - ForwardIterator must be a model of Forward Iterator
       - ForwardIterator's value type is either convertible to double
         or, in the weighted case, convertible to utility::DataWeight.
    */
    template<typename ForwardIterator>
    qQuantileNormalizer(ForwardIterator first, ForwardIterator last,
                        unsigned int Q);

    /**
       \brief Perform the Q-quantile normalization.

       Elements in [\a first, \a last) are normalized as described
       above and the result is assigned to [\a result, \a result +
       (\a last - \a first)). In the weighted case, weights are copied
       from [\a first, \a last) to the \a result range.

       It is possible to normalize "in place"; it is permissible for
       \a first and \a result to be the same. However, as assignment
       occurs sequentially, the operation is undefined if \a result is
       the same as any iterator in [\a first, \a last) other than
       \a first.

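       For example (illustration only; here \c data is assumed to be a
       std::vector<double> and \c qqn an already constructed
       qQuantileNormalizer):

       \code
       qqn(data.begin(), data.end(), data.begin()); // normalize in place
       \endcode
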
       Requirements:
       - RandomAccessIterator1 is a model of Random Access Iterator
       - RandomAccessIterator2 is a model of Random Access Iterator
       - RandomAccessIterator2 is mutable
       - Either RandomAccessIterator1's and RandomAccessIterator2's
         value types are convertible to double OR
         RandomAccessIterator1 and RandomAccessIterator2 are models of
         \ref concept_weighted_iterator
     */
    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
    void operator()(RandomAccessIterator1 first, RandomAccessIterator1 last,
                    RandomAccessIterator2 result) const;

  private:

  /**
     \brief Partition a range of data into parts of approximately equal
     size (sum of weights).

     The class also calculates the average of each part and assigns
     the average to the mid point of each part. The mid point is a
     double, i.e., it is not forced to be an integer index.
  */
  class Partitioner
  {
  public:
    /**
       \brief Create the partition and perform required calculations.
    */
    template<typename ForwardIterator>
    Partitioner(ForwardIterator first, ForwardIterator last,
                unsigned int N);

    /**
       \brief Return the averages for each part.

       \return The average vector.
    */
    const utility::Vector& averages(void) const;

    /**
       \brief Return the mid point for each partition.

       \return The quantiles vector.
    */
    const utility::Vector& quantiles(void) const;

    /**
       \return The number of parts.
    */
    size_t size(void) const;

  private:
    // unweighted "constructor"
    template<typename ForwardIterator>
    void build(ForwardIterator first, ForwardIterator last, unsigned int N,
               utility::unweighted_iterator_tag);
    // weighted "constructor"
    template<typename ForwardIterator>
    void build(ForwardIterator first, ForwardIterator last, unsigned int N,
               utility::weighted_iterator_tag);
    void init(const utility::VectorBase&, unsigned int N);
    void init(const std::vector<utility::DataWeight>&, unsigned int N);

    utility::Vector average_;
    utility::Vector quantiles_;
  };

    Partitioner target_;

    // unweighted version
    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
    void normalize(const Partitioner& source, RandomAccessIterator1 first,
                   RandomAccessIterator1 last, RandomAccessIterator2 result,
                   utility::unweighted_iterator_tag tag) const;

    // weighted version
    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
    void normalize(const Partitioner& source, RandomAccessIterator1 first,
                   RandomAccessIterator1 last, RandomAccessIterator2 result,
                   utility::weighted_iterator_tag tag) const;
  };


  // template implementations

  template<typename ForwardIterator>
  qQuantileNormalizer::qQuantileNormalizer(ForwardIterator first,
                                           ForwardIterator last,
                                           unsigned int Q)
    : target_(Partitioner(first, last, Q))
  {
    utility::yat_assert<std::runtime_error>(Q>2,
                                            "qQuantileNormalizer: Q too small");
  }


  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
  void qQuantileNormalizer::operator()(RandomAccessIterator1 first,
                                       RandomAccessIterator1 last,
                                       RandomAccessIterator2 result) const
  {
    Partitioner source(first, last, target_.size());
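    // tag dispatch: select the weighted or the unweighted implementation
    // depending on the type of the result iterator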
    typename utility::weighted_iterator_traits<RandomAccessIterator2>::type tag;
    normalize(source, first, last, result, tag);
  }


  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
  void
  qQuantileNormalizer::normalize(const qQuantileNormalizer::Partitioner& source,
                                 RandomAccessIterator1 first,
                                 RandomAccessIterator1 last,
                                 RandomAccessIterator2 result,
                                 utility::unweighted_iterator_tag tag) const
  {
    utility::check_iterator_is_unweighted(first);
    utility::check_iterator_is_unweighted(result);
    size_t N = last-first;
    utility::yat_assert<std::runtime_error>
      (N >= target_.size(), "qQuantileNormalizer: Input range too small");

    std::vector<size_t> sorted_index(last-first);
    utility::sort_index(first, last, sorted_index);

    utility::Vector diff(source.averages());
    diff-=target_.averages();
    const utility::Vector& idx=target_.quantiles();
    regression::CSplineInterpolation cspline(idx,diff);

    // linear extrapolation for first part, i.e., use first diff for
    // all points in the first part.
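    // An element with sorted index i sits at quantile (i+0.5)/N, so the
    // first part covers all i with (i+0.5)/N < idx(0), i.e.,
    // i < N*idx(0) - 0.5.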
    size_t start=0;
    size_t end = static_cast<size_t>(std::ceil(N*idx(0) - 0.5));
    // take care of the limiting case when the number of parts is
    // approximately equal to the number of elements in the range.
    if (end==0)
      ++end;
    for (size_t i=start; i<end; ++i) {
      size_t si = sorted_index[i];
      result[si] = first[si] - diff(0);
    }

    using utility::yat_assert;

    // cspline interpolation for all data between the mid points of
    // the first and last part
    start=end;
    end = static_cast<size_t>(std::ceil(N*idx(idx.size()-1) - 0.5));
    if (end>=N)
      end = N-1;
    for (size_t i=start; i<end; ++i) {
      size_t si = sorted_index[i];

      yat_assert<std::runtime_error>((i+0.5)/N>idx(0),
                               "qQuantileNormalizer: invalid input to cspline");
      result[si] = first[si] - cspline.evaluate((i+0.5)/N);
    }

    // linear extrapolation for last part, i.e., use last diff for
    // all points in the last part.
    for (size_t i=end; i<N; ++i) {
      size_t si = sorted_index[i];
      result[si] = first[si] - diff(diff.size()-1);
    }
  }


  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
  void qQuantileNormalizer::normalize(const Partitioner& source,
                                      RandomAccessIterator1 first,
                                      RandomAccessIterator1 last,
                                      RandomAccessIterator2 result,
                                      utility::weighted_iterator_tag tag) const
  {
    // copy the weights
    std::copy(utility::weight_iterator(first),
              utility::weight_iterator(last),
              utility::weight_iterator(result));

    double total_w = std::accumulate(utility::weight_iterator(first),
                                     utility::weight_iterator(last), 0.0);

    std::vector<size_t> sorted_index(last-first);
    utility::sort_index(utility::data_iterator(first),
                        utility::data_iterator(last), sorted_index);

    utility::Vector diff(source.averages());
    diff-=target_.averages();
    const utility::Vector& idx=target_.quantiles();
    regression::CSplineInterpolation cspline(idx,diff);

    double sum_w=0;
    utility::iterator_traits<RandomAccessIterator1> traits1;
    utility::iterator_traits<RandomAccessIterator2> traits2;
    for (size_t i=0; i<sorted_index.size(); ++i) {
      size_t si = sorted_index[i];
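      // quantile position of element si: the weighted analogue of
      // (i+0.5)/N, i.e., the weight accumulated below si plus half its
      // own weight, divided by the total weight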
      double w = (sum_w + 0.5*traits1.weight(first+si))/total_w;
      double correction = 0;
      if (w <= idx(0)) {
        // linear extrapolation for first part, i.e., use first diff for
        // all points in the first part.
        correction = diff(0);
      }
      else if (w < idx(idx.size()-1)) {
        // cspline interpolation for all data between the mid points of
        // the first and last part
        correction = cspline.evaluate(w);
      }
      else {
        // linear extrapolation for last part, i.e., use last diff for
        // all points in the last part.
        correction = diff(diff.size()-1);
      }
      traits2.data(result+si) = traits1.data(first+si) - correction;
      sum_w += traits1.weight(first+si);
    }
  }


  template<typename ForwardIterator>
  qQuantileNormalizer::Partitioner::Partitioner(ForwardIterator first,
                                                ForwardIterator last,
                                                unsigned int N)
    : average_(utility::Vector(N)), quantiles_(utility::Vector(N))
  {
    build(first, last, N,
          typename utility::weighted_iterator_traits<ForwardIterator>::type());
  }


  template<typename ForwardIterator>
  void qQuantileNormalizer::Partitioner::build(ForwardIterator first,
                                               ForwardIterator last,
                                               unsigned int N,
                                               utility::unweighted_iterator_tag)
  {
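    // sort an unweighted copy of the data; init() then computes the
    // average and mid point of each part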
    utility::Vector vec(std::distance(first, last));
    std::copy(first, last, vec.begin());
    std::sort(vec.begin(), vec.end());
    init(vec, N);
  }


  template<typename ForwardIterator>
  void qQuantileNormalizer::Partitioner::build(ForwardIterator first,
                                               ForwardIterator last,
                                               unsigned int N,
                                               utility::weighted_iterator_tag)
  {
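    // sort a weighted (DataWeight) copy of the data; init() then
    // computes the weighted average and mid point of each part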
    std::vector<utility::DataWeight> vec;
    vec.reserve(std::distance(first, last));
    std::back_insert_iterator<std::vector<utility::DataWeight> > inserter(vec);
    std::copy(first, last, inserter);
    std::sort(vec.begin(), vec.end());
    init(vec, N);
  }


}}} // end of namespace normalizer, yat and theplu

#endif