source: trunk/yat/normalizer/qQuantileNormalizer.h @ 2150

Last change on this file since 2150 was 2150, checked in by Peter, 13 years ago

fixes #584

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 14.7 KB
Line 
1#ifndef _theplu_yat_normalizer_qquantile_normalizer_
2#define _theplu_yat_normalizer_qquantile_normalizer_
3
4/*
5  Copyright (C) 2009 Jari Häkkinen, Peter Johansson
6  Copyright (C) 2010 Peter Johansson
7
8  This file is part of the yat library, http://dev.thep.lu.se/yat
9
10  The yat library is free software; you can redistribute it and/or
11  modify it under the terms of the GNU General Public License as
12  published by the Free Software Foundation; either version 3 of the
13  License, or (at your option) any later version.
14
15  The yat library is distributed in the hope that it will be useful,
16  but WITHOUT ANY WARRANTY; without even the implied warranty of
17  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18  General Public License for more details.
19
20  You should have received a copy of the GNU General Public License
21  along with yat. If not, see <http://www.gnu.org/licenses/>.
22*/
23
24#include "yat/regression/CSplineInterpolation.h"
25#include "yat/utility/DataIterator.h"
26#include "yat/utility/DataWeight.h"
27#include "yat/utility/iterator_traits.h"
28#include "yat/utility/sort_index.h"
29#include "yat/utility/Vector.h"
30#include "yat/utility/WeightIterator.h"
31#include "yat/utility/yat_assert.h"
32
33#include <boost/concept_check.hpp>
34
35#include <algorithm>
36#include <cmath>
37#include <iterator>
38#include <limits>
39#include <numeric>
40#include <stdexcept>
41#include <vector>
42
43namespace theplu {
44namespace yat {
45namespace utility {
46  class VectorBase;
47}
48namespace normalizer {
49
50  /**
51     \brief Perform Q-quantile normalization
52
53     Perform a Q-quantile normalization on a \a source range, after
54     which it will approximately have the same distribution of data as
55     the \a target range (the Q-quantiles are the same). The rank of
56     an element in the \a source range is not changed.
57
58     The class works also with unweighed ranges, and there is no
59     restriction that weighted \a source range requires weighted \a
60     target range or vice versa.
61
62     Normalization goes like this:
63     - Data are not assumed to be sorted.
64     - Partition sorted \a target data in Q parts. Q must be 3 or larger
65       because of requirements from the underlying cubic spline fit
66     - Calculate the arithmetic (weighted) mean for each part, the mean is
67       assigned to the mid point of each part.
68     - Do the above for the data to be transformed (called \a source
69       here).
70     - For each part, calculate the difference between the \a target
71       and \a the source. Now we have \a Q differences \f$ d_i \f$
72       with associated rank (midpoint of each part).
73     - Create a cubic spline fit to this difference vector \a d. The
74       resulting curve is used to recalculate all values in \a source.
75       - Use the cubic spline fit for values within the cubic spline
76         fit range [midpoint 1st part, midpoint last part].
77       - For data outside the cubic spline fit use linear
78         extrapolation, i.e., a constant shift. \f$ d_{first} \f$ for
79         points below fit range, and \f$ d_{last} \f$ for points above fit
80         range.
81
82     \since New in yat 0.5
83   */
84  class qQuantileNormalizer
85  {
86  public:
87    /**
88       \brief Constructor
89
90       Divides a sorted copy of range [\a first,\a last) into \a Q
91       parts. Parts are divided such that the sum of weights is
92       approximately the same in the different parts. If a relative
93       weight, \f$ w_i / \sum w_i \f$, is larger than 1/Q this might
94       be difficult to achieve, in which case a an exception is
95       thrown. In the unweighted case this implies that \a Q should be
96       smaller (or equal) than number of elements in [\a first, \a
97       last).
98
99       The range supplied to the constructor sets the target
100       distribution.
101
102       As the \a source range is also divided into \a Q parts (when
103       operator() is called), it is recommended to keep \a Q smaller
104       (or equal) than the size of ranges that will be normalized.
105
106       Also, \a Q must not be smaller than 3 due to restrictions in
107       the cubic spline fit utilized in the normalization.
108
109       \b Type \b Requirements:
110       - \c ForwardIterator is a model of \forward_iterator.
111       - \c ForwardIterator is a \ref concept_data_iterator
112    */
113    template<typename ForwardIterator>
114    qQuantileNormalizer(ForwardIterator first, ForwardIterator last,
115                        unsigned int Q);
116
117    /**
118       \brief Perform the Q-quantile normalization.
119
120       Elements in [\a first, \a last) are normalized as described
121       above and the result is assigned to [\a result, \a result + \a
122       last-\a first). Input range [\a first, \a last) is not
123       modified. If ranges are weighted, the weights are copied from
124       [\a first, \a last) to \a result range.
125
126       It is possible to normalize "in place"; it is permissible for
127       \a first and \a result to be the same. However, as assignment
128       occurs sequentially, the operation is undefined if \a result is
129       the same as any in range [\a first, \a last).
130
131       \b Type Requirements:
132       - \c RandomAccessIterator1 is a model of \random_access_iterator
133       - \c RandomAccessIterator1 is a \ref concept_data_iterator
134       - \c RandomAccessIterator1's value type is convertible to
135         \c RandomAccessIterator2's value type
136       - \c RandomAccessIterator2 is a model of \random_access_iterator
137       - \c RandomAccessIterator2 is mutable.
138
139     */
140    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
141    void operator()(RandomAccessIterator1 first, RandomAccessIterator1 last,
142                    RandomAccessIterator2 result) const;
143
144  private:
145
146  /**
147     \brief Partition a range of data into equal sizes.
148
149     Copy the range [first, last), sort the copy, and divide the
150     sorted copy in Q parts. The parts are created such that the total
151     weight in a part is approximately W/Q where W is the total weight
152     (over all parts). The class calculates the average value in each
153     part and also the "quantile".
154  */
155  class Partitioner
156  {
157  public:
158    /**
159       \brief Create the partition and perform required calculations.
160    */
161    template<typename ForwardIterator>
162    Partitioner(ForwardIterator first, ForwardIterator last, 
163                unsigned int N);
164
165    /**
166       \brief Return the averages for each part.
167
168       \return The average vector.
169    */
170    const utility::Vector& averages(void) const;
171
172    /**
173       The quantile (here) is defined as (w_lower + w_upper) / 2W,
174       where w_lower is the total weight of elements smaller than the
175       smallest element in the part, and w_upper is the total weight
176       of elements smaller (or equal) than the largest value in the
177       part. 
178
179       In the unweighted case all weights are 1.0, which implies q_0 =
180       n_0/N, q_1 = (n_0+n_1/2)/N, q_2 = (n_0+n_1+n_2/2)/N where n_i
181       is number of elements in ith part.
182
183       \return The quantiles vector.
184    */
185    const utility::Vector& quantiles(void) const;
186
187    /**
188       \return The number of parts.
189    */
190    size_t size(void) const;
191
192  private:
193    // unweighted "constructor"
194    template<typename ForwardIterator>
195    void build(ForwardIterator first, ForwardIterator last, unsigned int N, 
196               utility::unweighted_iterator_tag);
197    // weighted "constructor"
198    template<typename ForwardIterator>
199    void build(ForwardIterator first, ForwardIterator last, unsigned int N, 
200               utility::weighted_iterator_tag);
201    void init(const utility::VectorBase&, unsigned int N);
202    void init(const std::vector<utility::DataWeight>&, unsigned int N);
203
204    utility::Vector average_;
205    utility::Vector quantiles_;
206  };
207
208    Partitioner target_;
209
210    // unweighted version
211    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
212    void normalize(const Partitioner& source,RandomAccessIterator1 first, 
213                   RandomAccessIterator1 last, RandomAccessIterator2 result,
214                   utility::unweighted_iterator_tag tag) const;
215
216    // weighted version
217    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
218    void normalize(const Partitioner& source,RandomAccessIterator1 first, 
219                   RandomAccessIterator1 last, RandomAccessIterator2 result,
220                   utility::weighted_iterator_tag tag) const;
221  };
222
223
224  // template implementations
225
226  template<typename ForwardIterator>
227  qQuantileNormalizer::qQuantileNormalizer(ForwardIterator first, 
228                                           ForwardIterator last,
229                                           unsigned int Q)
230    : target_(Partitioner(first, last, Q))
231  {
232    utility::yat_assert<std::runtime_error>(Q>2,
233                                            "qQuantileNormalizer: Q too small");
234  }
235
236
237  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
238  void qQuantileNormalizer::operator()(RandomAccessIterator1 first, 
239                                       RandomAccessIterator1 last,
240                                       RandomAccessIterator2 result) const
241  {
242    Partitioner source(first, last, target_.size());
243    typename utility::weighted_iterator_traits<RandomAccessIterator2>::type tag;
244    normalize(source, first, last, result, tag);
245  }
246
247
248  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
249  void 
250  qQuantileNormalizer::normalize(const qQuantileNormalizer::Partitioner& source,
251                                 RandomAccessIterator1 first, 
252                                 RandomAccessIterator1 last, 
253                                 RandomAccessIterator2 result,
254                                 utility::unweighted_iterator_tag tag) const
255  {
256    using boost::function_requires;
257    function_requires<boost::RandomAccessIterator<RandomAccessIterator1> >();
258    function_requires<boost::Mutable_RandomAccessIterator<RandomAccessIterator2> >();
259    utility::check_iterator_is_unweighted(first);
260    utility::check_iterator_is_unweighted(result);
261    size_t N = last-first;
262    utility::yat_assert<std::runtime_error>
263      (N >= target_.size(), "qQuantileNormalizer: Input range too small");
264
265    std::vector<size_t> sorted_index(last-first);
266    utility::sort_index(first, last, sorted_index);
267
268    utility::Vector diff(source.averages());
269    diff-=target_.averages();
270    const utility::Vector& idx=target_.quantiles();
271    regression::CSplineInterpolation cspline(idx,diff);
272
273    // linear extrapolation for first part, i.e., use first diff for
274    // all points in the first part.
275    size_t start=0;
276    size_t end = static_cast<size_t>(std::ceil(N*idx(0) - 0.5));
277    // take care of limiting case number of parts approximately equal
278    // to the number of elements in range.
279    if (end==0)
280      ++end;
281    for (size_t i=start; i<end; ++i) {
282      size_t si = sorted_index[i];
283      result[si] = first[si] - diff(0);
284    }
285
286    using utility::yat_assert;
287   
288    // cspline interpolation for all data between the mid points of
289    // the first and last part
290    start=end;
291    end = static_cast<size_t>(std::ceil(N*idx(idx.size()-1) - 0.5));
292    if (end>=N)
293      end = N-1;
294    for ( size_t i=start; i<end; ++i) {
295      size_t si = sorted_index[i];
296     
297      yat_assert<std::runtime_error>((i+0.5)/N>idx(0), 
298                               "qQuantileNormalizer: invalid input to cspline");
299      result[si] = first[si] - cspline.evaluate((i+0.5)/N);
300    }
301   
302    // linear extrapolation for last part, i.e., use last diff for
303    // all points in the last part.
304    for (size_t i=end ; i<N; ++i) {
305      size_t si = sorted_index[i];
306      result[si] = first[si] - diff(diff.size()-1);
307    }
308  }
309
310
311  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
312  void qQuantileNormalizer::normalize(const Partitioner& source,
313                                      RandomAccessIterator1 first, 
314                                      RandomAccessIterator1 last, 
315                                      RandomAccessIterator2 result,
316                                      utility::weighted_iterator_tag tag) const
317  {
318    using boost::function_requires;
319    function_requires<boost::RandomAccessIterator<RandomAccessIterator1> >();
320    function_requires<boost::Mutable_RandomAccessIterator<RandomAccessIterator2> >();
321    // copy the weights
322    std::copy(utility::weight_iterator(first), 
323              utility::weight_iterator(last), 
324              utility::weight_iterator(result)); 
325
326    double total_w = std::accumulate(utility::weight_iterator(first),
327                                     utility::weight_iterator(last), 0.0);
328
329    std::vector<size_t> sorted_index(last-first);
330    // Code to avoid problems with NaN (ticket:535)
331    // utility::sort_index(utility::data_iterator(first),
332    //                     utility::data_iterator(last), sorted_index);
333    // ... above statement replaced below code block
334    {
335      using namespace utility;
336      std::vector<double> vec;
337      vec.reserve(std::distance(first, last));
338      std::copy(utility::data_iterator(first), utility::data_iterator(last), 
339                std::back_inserter(vec));
340      for (std::vector<double>::iterator i(vec.begin()); i!=vec.end(); ++i)
341        if (std::isnan(*i))
342          *i = std::numeric_limits<double>::infinity();
343      utility::sort_index(vec.begin(), vec.end(), sorted_index);
344    }
345    // end Code to avoid problems with NaN (ticket:535)
346
347    utility::Vector diff(source.averages());
348    diff-=target_.averages();
349    const utility::Vector& idx=target_.quantiles();
350    regression::CSplineInterpolation cspline(idx,diff);
351
352    double sum_w=0;
353    utility::iterator_traits<RandomAccessIterator1> traits1;
354    utility::iterator_traits<RandomAccessIterator2> traits2;
355    for (size_t i=0; i<sorted_index.size(); ++i) {
356      size_t si = sorted_index[i];
357      double w = (sum_w + 0.5*traits1.weight(first+si))/total_w;
358      double correction = 0;
359      if (w <= idx(0)) {
360        // linear extrapolation for first part, i.e., use first diff for
361        // all points in the first part.
362        correction = diff(0);
363      }
364      else if (w < idx(idx.size()-1) ) {
365        // cspline interpolation for all data between the mid points of
366        // the first and last part
367        correction = cspline.evaluate(w);
368      }
369      else {
370        // linear extrapolation for last part, i.e., use last diff for
371        // all points in the last part.
372        correction = diff(diff.size()-1);
373      }
374      traits2.data(result+si) = traits1.data(first+si) - correction;
375      sum_w += traits1.weight(first+si);
376    }
377  }
378
379
380  template<typename ForwardIterator>
381  qQuantileNormalizer::Partitioner::Partitioner(ForwardIterator first, 
382                                                ForwardIterator last, 
383                                                unsigned int N)
384    : average_(utility::Vector(N)), quantiles_(utility::Vector(N))
385  {
386    boost::function_requires<boost::ForwardIterator<ForwardIterator> >();
387    build(first, last, N, 
388          typename utility::weighted_iterator_traits<ForwardIterator>::type());
389  }
390
391
392  template<typename ForwardIterator>
393  void qQuantileNormalizer::Partitioner::build(ForwardIterator first, 
394                                               ForwardIterator last, 
395                                               unsigned int N, 
396                                               utility::unweighted_iterator_tag)
397  {
398    utility::Vector vec(std::distance(first, last));
399    std::copy(first, last, vec.begin());
400    std::sort(vec.begin(), vec.end());
401    init(vec, N);
402  }
403
404
405  template<typename ForwardIterator>
406  void qQuantileNormalizer::Partitioner::build(ForwardIterator first, 
407                                               ForwardIterator last, 
408                                               unsigned int N, 
409                                               utility::weighted_iterator_tag)
410  {
411    using namespace utility;
412    std::vector<DataWeight> vec;
413    vec.reserve(std::distance(first, last));
414    std::back_insert_iterator<std::vector<DataWeight> > inserter(vec);
415    std::copy(first, last, inserter);
416    std::sort(vec.begin(), vec.end());
417    init(vec, N);
418  }
419
420
421}}} // end of namespace normalizer, yat and thep
422
423#endif
Note: See TracBrowser for help on using the repository browser.