source: trunk/yat/normalizer/qQuantileNormalizer.h @ 1810

Last change on this file since 1810 was 1810, checked in by Peter, 14 years ago

working on documentation. refs #478

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 11.8 KB
Line 
1#ifndef _theplu_yat_normalizer_qquantile_normalizer_
2#define _theplu_yat_normalizer_qquantile_normalizer_
3
4/*
5  Copyright (C) 2009 Jari Häkkinen, Peter Johansson
6
7  This file is part of the yat library, http://dev.thep.lu.se/yat
8
9  The yat library is free software; you can redistribute it and/or
10  modify it under the terms of the GNU General Public License as
11  published by the Free Software Foundation; either version 3 of the
12  License, or (at your option) any later version.
13
14  The yat library is distributed in the hope that it will be useful,
15  but WITHOUT ANY WARRANTY; without even the implied warranty of
16  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  General Public License for more details.
18
19  You should have received a copy of the GNU General Public License
20  along with yat. If not, see <http://www.gnu.org/licenses/>.
21*/
22
23#include "yat/regression/CSplineInterpolation.h"
24#include "yat/utility/DataIterator.h"
25#include "yat/utility/DataWeight.h"
26#include "yat/utility/iterator_traits.h"
27#include "yat/utility/sort_index.h"
28#include "yat/utility/Vector.h"
29#include "yat/utility/WeightIterator.h"
30#include "yat/utility/yat_assert.h"
31
32#include <algorithm>
33#include <cmath>
34#include <iterator>
35#include <numeric>
36#include <stdexcept>
37#include <vector>
38
39namespace theplu {
40namespace yat {
41namespace utility {
42  class VectorBase;
43}
44namespace normalizer {
45
46  /**
47     \brief Perform Q-quantile normalization
48
49     Perform a Q-quantile normalization on a \a source range, after
50     which it will approximately have the same distribution of data as
51     the \a target range (the Q-quantiles are the same). The rank of
52     an element in the \a source range is not changed.
53
     The class works also with unweighted ranges, and there is no
     restriction that a weighted \a source range requires a weighted \a
     target range or vice versa.
57
58     Normalization goes like this:
59     - Data are not assumed to be sorted.
60     - Partition sorted \a target data in N parts. N must be 3 or larger
61       because of requirements from the underlying cspline fit
62     - Calculate the arithmetic (weighted) mean for each part, the mean is
63       assigned to the mid point of each part.
     - Do the above for the data to be transformed (called \a source
       here).
     - For each part, calculate the difference between the \a target
       and the \a source. Now we have \f$ N \f$ differences \f$ d_i \f$
       with associated rank (midpoint of each part).
69     - Create a cubic spline fit to this difference vector \a d. The
70       resulting curve is used to recalculate all values in \a source.
71       - Use the cubic spline fit for values within the cubic spline
72         fit range [midpoint 1st part, midpoint last part].
       - For data outside the cubic spline fit use linear
         extrapolation, i.e., a constant shift. \f$ d_{first} \f$ for
         points below fit range, and \f$ d_{last} \f$ for points above
         fit range.
77
78     \since New in yat 0.5
79   */
  class qQuantileNormalizer
  {
  public:
    /**
       \brief Constructor

       \a Q is the number of parts and must be within \f$ [3,N] \f$
       where \f$ N \f$ is the total number of data points in the
       target. However, if \f$ N \f$ is larger than the number of points
       in the data to be normalized the behaviour of the code is
       undefined. Keep \f$ N \f$ equal to or less than the smallest
       number of data points in the target or each data set to be
       normalized against a given target. The lower bound of three is
       due to restrictions in the cspline fit utilized in the
       normalization.
    */
    template<typename ForwardIterator>
    qQuantileNormalizer(ForwardIterator first, ForwardIterator last,
                        unsigned int Q);

    /**
       \brief perform the Q-quantile normalization.

       Elements in [\a first, \a last) are normalized against the
       target distribution given at construction and the normalized
       values are written to \a result. The rank of an element within
       the range is preserved.

       It is possible to normalize "in place"; it is permissible for
       \a first and \a result to be the same.
     */
    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
    void operator()(RandomAccessIterator1 first, RandomAccessIterator1 last,
                    RandomAccessIterator2 result) const;

  private:

  /**
     \brief Partition a range of data into equal sizes.

     The class also calculates the average of each part and assigns
     the average to the mid point of each part. The midpoint is a
     double, i.e., it is not forced to be an integer index.
  */
  class Partitioner
  {
  public:
    /**
       \brief Create the partition and perform required calculations.
    */
    template<typename ForwardIterator>
    Partitioner(ForwardIterator first, ForwardIterator last, 
                unsigned int N);

    /**
       \brief Return the averages for each part.

       \return The average vector.
    */
    const utility::Vector& averages(void) const;

    /**
       \brief Return the mid point for each partition.

       \return The index vector.
    */
    const utility::Vector& index(void) const;

    /**
       \return The number of parts.
    */
    size_t size(void) const;

  private:
    // unweighted "constructor": sorts a copy of the data and delegates
    // to init(const utility::VectorBase&, unsigned int)
    template<typename ForwardIterator>
    void build(ForwardIterator first, ForwardIterator last, unsigned int N, 
               utility::unweighted_iterator_tag);
    // weighted "constructor": sorts a copy of the (data, weight) pairs
    // and delegates to init(const std::vector<utility::DataWeight>&, ...)
    template<typename ForwardIterator>
    void build(ForwardIterator first, ForwardIterator last, unsigned int N, 
               utility::weighted_iterator_tag);
    void init(const utility::VectorBase&, unsigned int N);
    void init(const std::vector<utility::DataWeight>&, unsigned int N);

    utility::Vector average_;  // (weighted) average of each part
    utility::Vector index_;    // mid point (fractional rank) of each part
  };

    // partitioning of the target data, created at construction
    Partitioner target_;

    // unweighted version; dispatched to from operator() when the
    // result iterator is unweighted
    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
    void normalize(const Partitioner& source,RandomAccessIterator1 first, 
                   RandomAccessIterator1 last, RandomAccessIterator2 result,
                   utility::unweighted_iterator_tag tag) const;

    // weighted version; dispatched to from operator() when the
    // result iterator is weighted
    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
    void normalize(const Partitioner& source,RandomAccessIterator1 first, 
                   RandomAccessIterator1 last, RandomAccessIterator2 result,
                   utility::weighted_iterator_tag tag) const;
  };
178
179
180  // template implementations
181
182  template<typename ForwardIterator>
183  qQuantileNormalizer::qQuantileNormalizer(ForwardIterator first, 
184                                           ForwardIterator last,
185                                           unsigned int Q)
186    : target_(Partitioner(first, last, Q))
187  {
188    utility::yat_assert<std::runtime_error>(Q>2,
189                                            "qQuantileNormalizer: Q too small");
190  }
191
192
193  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
194  void qQuantileNormalizer::operator()(RandomAccessIterator1 first, 
195                                       RandomAccessIterator1 last,
196                                       RandomAccessIterator2 result) const
197  {
198    Partitioner source(first, last, target_.size());
199    typename utility::weighted_iterator_traits<RandomAccessIterator2>::type tag;
200    normalize(source, first, last, result, tag);
201  }
202
203
  // Unweighted normalization. Elements are corrected by the difference
  // between the source and target part averages: a constant shift in the
  // first and last parts, a cubic-spline interpolated shift in between.
  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
  void 
  qQuantileNormalizer::normalize(const qQuantileNormalizer::Partitioner& source,
                                 RandomAccessIterator1 first, 
                                 RandomAccessIterator1 last, 
                                 RandomAccessIterator2 result,
                                 utility::unweighted_iterator_tag tag) const
  {
    // this overload requires unweighted input and output ranges
    utility::check_iterator_is_unweighted(first);
    utility::check_iterator_is_unweighted(result);
    size_t N = last-first;
    utility::yat_assert<std::runtime_error>
      (N >= target_.size(), "qQuantileNormalizer: Input range too small");

    // ranks of the source data: sorted_index[i] is the position (in the
    // input range) of the element with rank i
    std::vector<size_t> sorted_index(last-first);
    utility::sort_index(first, last, sorted_index);

    // per-part difference between source and target averages; the
    // correction curve is a cubic spline through (mid point, diff)
    utility::Vector diff(source.averages());
    diff-=target_.averages();
    const utility::Vector& idx=target_.index();
    regression::CSplineInterpolation cspline(idx,diff);

    // linear extrapolation for first part, i.e., use first diff for
    // all points in the first part.
    size_t start=0;
    size_t end = static_cast<size_t>(std::ceil(N*idx(0) - 0.5));
    // take care of limiting case number of parts approximately equal
    // to the number of elements in range.
    if (end==0)
      ++end;
    for (size_t i=start; i<end; ++i) {
      size_t si = sorted_index[i];
      result[si] = first[si] - diff(0);
    }

    using utility::yat_assert;
   
    // cspline interpolation for all data between the mid points of
    // the first and last part
    start=end;
    end = static_cast<size_t>(std::ceil(N*idx(idx.size()-1) - 0.5));
    if (end>=N)
      end = N-1;
    for ( size_t i=start; i<end; ++i) {
      size_t si = sorted_index[i];
     
      // (i+0.5)/N is the quantile of the element with rank i; it must
      // lie inside the spline's fitted range [idx(0), idx(last)]
      yat_assert<std::runtime_error>((i+0.5)/N>idx(0), 
                               "qQuantileNormalizer: invalid input to cspline");
      result[si] = first[si] - cspline.evaluate((i+0.5)/N);
    }
   
    // linear extrapolation for last part, i.e., use last diff for
    // all points in the last part.
    for (size_t i=end ; i<N; ++i) {
      size_t si = sorted_index[i];
      result[si] = first[si] - diff(diff.size()-1);
    }
  }
262
263
  // Weighted normalization. Same correction scheme as the unweighted
  // overload, but an element's quantile is its cumulative weight
  // fraction rather than (rank+0.5)/N. Weights are copied through
  // unchanged; only the data values are corrected.
  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
  void qQuantileNormalizer::normalize(const Partitioner& source,
                                      RandomAccessIterator1 first, 
                                      RandomAccessIterator1 last, 
                                      RandomAccessIterator2 result,
                                      utility::weighted_iterator_tag tag) const
  {
    // copy the weights
    std::copy(utility::weight_iterator(first), 
              utility::weight_iterator(last), 
              utility::weight_iterator(result)); 

    double total_w = std::accumulate(utility::weight_iterator(first),
                                     utility::weight_iterator(last), 0.0);

    // ranks of the source data values (weights play no role in ranking)
    std::vector<size_t> sorted_index(last-first);
    utility::sort_index(utility::data_iterator(first), 
                        utility::data_iterator(last), sorted_index);

    // per-part difference between source and target averages; the
    // correction curve is a cubic spline through (mid point, diff)
    utility::Vector diff(source.averages());
    diff-=target_.averages();
    const utility::Vector& idx=target_.index();
    regression::CSplineInterpolation cspline(idx,diff);

    // sum_w accumulates the weight of all elements with lower rank
    double sum_w=0;
    utility::iterator_traits<RandomAccessIterator1> traits1;
    utility::iterator_traits<RandomAccessIterator2> traits2;
    for (size_t i=0; i<sorted_index.size(); ++i) {
      size_t si = sorted_index[i];
      // weighted analogue of (i+0.5)/N: cumulative weight up to the
      // mid point of this element, as a fraction of the total weight
      double w = (sum_w + 0.5*traits1.weight(first+si))/total_w;
      double correction = 0;
      if (w <= idx(0)) {
        // linear extrapolation for first part, i.e., use first diff for
        // all points in the first part.
        correction = diff(0);
      }
      else if (w < idx(idx.size()-1) ) {
        // cspline interpolation for all data between the mid points of
        // the first and last part
        correction = cspline.evaluate(w);
      }
      else {
        // linear extrapolation for last part, i.e., use last diff for
        // all points in the last part.
        correction = diff(diff.size()-1);
      }
      traits2.data(result+si) = traits1.data(first+si) - correction;
      sum_w += traits1.weight(first+si);
    }
  }
314
315
316  template<typename ForwardIterator>
317  qQuantileNormalizer::Partitioner::Partitioner(ForwardIterator first, 
318                                                ForwardIterator last, 
319                                                unsigned int N)
320    : average_(utility::Vector(N)), index_(utility::Vector(N))
321  {
322    build(first, last, N, 
323          typename utility::weighted_iterator_traits<ForwardIterator>::type());
324  }
325
326
327  template<typename ForwardIterator>
328  void qQuantileNormalizer::Partitioner::build(ForwardIterator first, 
329                                               ForwardIterator last, 
330                                               unsigned int N, 
331                                               utility::unweighted_iterator_tag)
332  {
333    utility::Vector vec(std::distance(first, last));
334    std::copy(first, last, vec.begin());
335    std::sort(vec.begin(), vec.end());
336    init(vec, N);
337  }
338
339
340  template<typename ForwardIterator>
341  void qQuantileNormalizer::Partitioner::build(ForwardIterator first, 
342                                               ForwardIterator last, 
343                                               unsigned int N, 
344                                               utility::weighted_iterator_tag)
345  {
346    std::vector<utility::DataWeight> vec;
347    vec.reserve(std::distance(first, last));
348    std::back_insert_iterator<std::vector<utility::DataWeight> > inserter(vec);
349    std::copy(first, last, inserter);
350    std::sort(vec.begin(), vec.end());
351    init(vec, N);
352  }
353
354
355}}} // end of namespace normalizer, yat and thep
356
357#endif
Note: See TracBrowser for help on using the repository browser.