source: trunk/yat/normalizer/qQuantileNormalizer.h @ 2158

Last change on this file since 2158 was 2158, checked in by Peter, 13 years ago

qQuantileNormalizer now works with mixed iterators. closes #498

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 14.6 KB
Line 
1#ifndef _theplu_yat_normalizer_qquantile_normalizer_
2#define _theplu_yat_normalizer_qquantile_normalizer_
3
4/*
5  Copyright (C) 2009 Jari Häkkinen, Peter Johansson
6  Copyright (C) 2010 Peter Johansson
7
8  This file is part of the yat library, http://dev.thep.lu.se/yat
9
10  The yat library is free software; you can redistribute it and/or
11  modify it under the terms of the GNU General Public License as
12  published by the Free Software Foundation; either version 3 of the
13  License, or (at your option) any later version.
14
15  The yat library is distributed in the hope that it will be useful,
16  but WITHOUT ANY WARRANTY; without even the implied warranty of
17  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18  General Public License for more details.
19
20  You should have received a copy of the GNU General Public License
21  along with yat. If not, see <http://www.gnu.org/licenses/>.
22*/
23
24#include "utility.h"
25
26#include "yat/regression/CSplineInterpolation.h"
27#include "yat/utility/DataIterator.h"
28#include "yat/utility/DataWeight.h"
29#include "yat/utility/iterator_traits.h"
30#include "yat/utility/sort_index.h"
31#include "yat/utility/Vector.h"
32#include "yat/utility/WeightIterator.h"
33#include "yat/utility/yat_assert.h"
34
35#include <boost/concept_check.hpp>
36
37#include <algorithm>
38#include <cmath>
39#include <iterator>
40#include <limits>
41#include <numeric>
42#include <stdexcept>
43#include <vector>
44
45namespace theplu {
46namespace yat {
47namespace utility {
48  class VectorBase;
49}
50namespace normalizer {
51
52  /**
53     \brief Perform Q-quantile normalization
54
55     Perform a Q-quantile normalization on a \a source range, after
56     which it will approximately have the same distribution of data as
57     the \a target range (the Q-quantiles are the same). The rank of
58     an element in the \a source range is not changed.
59
60     The class works also with unweighed ranges, and there is no
61     restriction that weighted \a source range requires weighted \a
62     target range or vice versa.
63
64     Normalization goes like this:
65     - Data are not assumed to be sorted.
66     - Partition sorted \a target data in Q parts. Q must be 3 or larger
67       because of requirements from the underlying cubic spline fit
68     - Calculate the arithmetic (weighted) mean for each part, the mean is
69       assigned to the mid point of each part.
70     - Do the above for the data to be transformed (called \a source
71       here).
72     - For each part, calculate the difference between the \a target
73       and \a the source. Now we have \a Q differences \f$ d_i \f$
74       with associated rank (midpoint of each part).
75     - Create a cubic spline fit to this difference vector \a d. The
76       resulting curve is used to recalculate all values in \a source.
77       - Use the cubic spline fit for values within the cubic spline
78         fit range [midpoint 1st part, midpoint last part].
79       - For data outside the cubic spline fit use linear
80         extrapolation, i.e., a constant shift. \f$ d_{first} \f$ for
81         points below fit range, and \f$ d_{last} \f$ for points above fit
82         range.
83
84     \since New in yat 0.5
85   */
86  class qQuantileNormalizer
87  {
88  public:
89    /**
90       \brief Constructor
91
92       Divides a sorted copy of range [\a first,\a last) into \a Q
93       parts. Parts are divided such that the sum of weights is
94       approximately the same in the different parts. If a relative
95       weight, \f$ w_i / \sum w_i \f$, is larger than 1/Q this might
96       be difficult to achieve, in which case a an exception is
97       thrown. In the unweighted case this implies that \a Q should be
98       smaller (or equal) than number of elements in [\a first, \a
99       last).
100
101       The range supplied to the constructor sets the target
102       distribution.
103
104       As the \a source range is also divided into \a Q parts (when
105       operator() is called), it is recommended to keep \a Q smaller
106       (or equal) than the size of ranges that will be normalized.
107
108       Also, \a Q must not be smaller than 3 due to restrictions in
109       the cubic spline fit utilized in the normalization.
110
111       \b Type \b Requirements:
112       - \c ForwardIterator is a model of \forward_iterator.
113       - \c ForwardIterator is a \ref concept_data_iterator
114    */
115    template<typename ForwardIterator>
116    qQuantileNormalizer(ForwardIterator first, ForwardIterator last,
117                        unsigned int Q);
118
119    /**
120       \brief Perform the Q-quantile normalization.
121
122       Elements in [\a first, \a last) are normalized as described
123       above and the result is assigned to [\a result, \a result + \a
124       last-\a first). Input range [\a first, \a last) is not
125       modified. If ranges are weighted, the weights are copied from
126       [\a first, \a last) to \a result range.
127
128       It is possible to normalize "in place"; it is permissible for
129       \a first and \a result to be the same. However, as assignment
130       occurs sequentially, the operation is undefined if \a result is
131       the same as any in range [\a first, \a last).
132
133       \b Type Requirements:
134       - \c RandomAccessIterator1 is a model of \random_access_iterator
135       - \c RandomAccessIterator1 is a \ref concept_data_iterator
136       - \c RandomAccessIterator1's value type is convertible to
137         \c RandomAccessIterator2's value type
138       - \c RandomAccessIterator2 is a model of \random_access_iterator
139       - \c RandomAccessIterator2 is mutable.
140
141     */
142    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
143    void operator()(RandomAccessIterator1 first, RandomAccessIterator1 last,
144                    RandomAccessIterator2 result) const;
145
146  private:
147
148  /**
149     \brief Partition a range of data into equal sizes.
150
151     Copy the range [first, last), sort the copy, and divide the
152     sorted copy in Q parts. The parts are created such that the total
153     weight in a part is approximately W/Q where W is the total weight
154     (over all parts). The class calculates the average value in each
155     part and also the "quantile".
156  */
157  class Partitioner
158  {
159  public:
160    /**
161       \brief Create the partition and perform required calculations.
162    */
163    template<typename ForwardIterator>
164    Partitioner(ForwardIterator first, ForwardIterator last, 
165                unsigned int N);
166
167    /**
168       \brief Return the averages for each part.
169
170       \return The average vector.
171    */
172    const utility::Vector& averages(void) const;
173
174    /**
175       The quantile (here) is defined as (w_lower + w_upper) / 2W,
176       where w_lower is the total weight of elements smaller than the
177       smallest element in the part, and w_upper is the total weight
178       of elements smaller (or equal) than the largest value in the
179       part. 
180
181       In the unweighted case all weights are 1.0, which implies q_0 =
182       n_0/N, q_1 = (n_0+n_1/2)/N, q_2 = (n_0+n_1+n_2/2)/N where n_i
183       is number of elements in ith part.
184
185       \return The quantiles vector.
186    */
187    const utility::Vector& quantiles(void) const;
188
189    /**
190       \return The number of parts.
191    */
192    size_t size(void) const;
193
194  private:
195    // unweighted "constructor"
196    template<typename ForwardIterator>
197    void build(ForwardIterator first, ForwardIterator last, unsigned int N, 
198               utility::unweighted_iterator_tag);
199    // weighted "constructor"
200    template<typename ForwardIterator>
201    void build(ForwardIterator first, ForwardIterator last, unsigned int N, 
202               utility::weighted_iterator_tag);
203    void init(const utility::VectorBase&, unsigned int N);
204    void init(const std::vector<utility::DataWeight>&, unsigned int N);
205
206    utility::Vector average_;
207    utility::Vector quantiles_;
208  };
209
210    Partitioner target_;
211
212    // unweighted version
213    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
214    void normalize(const Partitioner& source,RandomAccessIterator1 first, 
215                   RandomAccessIterator1 last, RandomAccessIterator2 result,
216                   utility::unweighted_iterator_tag tag) const;
217
218    // weighted version
219    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
220    void normalize(const Partitioner& source,RandomAccessIterator1 first, 
221                   RandomAccessIterator1 last, RandomAccessIterator2 result,
222                   utility::weighted_iterator_tag tag) const;
223  };
224
225
226  // template implementations
227
228  template<typename ForwardIterator>
229  qQuantileNormalizer::qQuantileNormalizer(ForwardIterator first, 
230                                           ForwardIterator last,
231                                           unsigned int Q)
232    : target_(Partitioner(first, last, Q))
233  {
234    utility::yat_assert<std::runtime_error>(Q>2,
235                                            "qQuantileNormalizer: Q too small");
236  }
237
238
239  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
240  void qQuantileNormalizer::operator()(RandomAccessIterator1 first, 
241                                       RandomAccessIterator1 last,
242                                       RandomAccessIterator2 result) const
243  {
244    Partitioner source(first, last, target_.size());
245    typename utility::weighted_if_any2<RandomAccessIterator1, RandomAccessIterator2>::type tag;
246    normalize(source, first, last, result, tag);
247  }
248
249
250  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
251  void 
252  qQuantileNormalizer::normalize(const qQuantileNormalizer::Partitioner& source,
253                                 RandomAccessIterator1 first, 
254                                 RandomAccessIterator1 last, 
255                                 RandomAccessIterator2 result,
256                                 utility::unweighted_iterator_tag tag) const
257  {
258    BOOST_CONCEPT_ASSERT((boost::RandomAccessIterator<RandomAccessIterator1>));
259    BOOST_CONCEPT_ASSERT((boost::Mutable_RandomAccessIterator<RandomAccessIterator2>));
260    utility::check_iterator_is_unweighted(first);
261    utility::check_iterator_is_unweighted(result);
262    size_t N = last-first;
263    utility::yat_assert<std::runtime_error>
264      (N >= target_.size(), "qQuantileNormalizer: Input range too small");
265
266    std::vector<size_t> sorted_index(last-first);
267    utility::sort_index(first, last, sorted_index);
268
269    utility::Vector diff(source.averages());
270    diff-=target_.averages();
271    const utility::Vector& idx=target_.quantiles();
272    regression::CSplineInterpolation cspline(idx,diff);
273
274    // linear extrapolation for first part, i.e., use first diff for
275    // all points in the first part.
276    size_t start=0;
277    size_t end = static_cast<size_t>(std::ceil(N*idx(0) - 0.5));
278    // take care of limiting case number of parts approximately equal
279    // to the number of elements in range.
280    if (end==0)
281      ++end;
282    for (size_t i=start; i<end; ++i) {
283      size_t si = sorted_index[i];
284      result[si] = first[si] - diff(0);
285    }
286
287    using utility::yat_assert;
288   
289    // cspline interpolation for all data between the mid points of
290    // the first and last part
291    start=end;
292    end = static_cast<size_t>(std::ceil(N*idx(idx.size()-1) - 0.5));
293    if (end>=N)
294      end = N-1;
295    for ( size_t i=start; i<end; ++i) {
296      size_t si = sorted_index[i];
297     
298      yat_assert<std::runtime_error>((i+0.5)/N>idx(0), 
299                               "qQuantileNormalizer: invalid input to cspline");
300      result[si] = first[si] - cspline.evaluate((i+0.5)/N);
301    }
302   
303    // linear extrapolation for last part, i.e., use last diff for
304    // all points in the last part.
305    for (size_t i=end ; i<N; ++i) {
306      size_t si = sorted_index[i];
307      result[si] = first[si] - diff(diff.size()-1);
308    }
309  }
310
311
312  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
313  void qQuantileNormalizer::normalize(const Partitioner& source,
314                                      RandomAccessIterator1 first, 
315                                      RandomAccessIterator1 last, 
316                                      RandomAccessIterator2 result,
317                                      utility::weighted_iterator_tag tag) const
318  {
319    BOOST_CONCEPT_ASSERT((boost::RandomAccessIterator<RandomAccessIterator1>));
320    BOOST_CONCEPT_ASSERT((boost::Mutable_RandomAccessIterator<RandomAccessIterator2>));
321    // copy the weights
322    detail::copy_weight_if_weighted(first, last, result);
323
324    double total_w = std::accumulate(utility::weight_iterator(first),
325                                     utility::weight_iterator(last), 0.0);
326
327    std::vector<size_t> sorted_index(last-first);
328    // Code to avoid problems with NaN (ticket:535)
329    // utility::sort_index(utility::data_iterator(first),
330    //                     utility::data_iterator(last), sorted_index);
331    // ... above statement replaced below code block
332    {
333      using namespace utility;
334      std::vector<double> vec;
335      vec.reserve(std::distance(first, last));
336      std::copy(utility::data_iterator(first), utility::data_iterator(last), 
337                std::back_inserter(vec));
338      for (std::vector<double>::iterator i(vec.begin()); i!=vec.end(); ++i)
339        if (std::isnan(*i))
340          *i = std::numeric_limits<double>::infinity();
341      utility::sort_index(vec.begin(), vec.end(), sorted_index);
342    }
343    // end Code to avoid problems with NaN (ticket:535)
344
345    utility::Vector diff(source.averages());
346    diff-=target_.averages();
347    const utility::Vector& idx=target_.quantiles();
348    regression::CSplineInterpolation cspline(idx,diff);
349
350    double sum_w=0;
351    utility::iterator_traits<RandomAccessIterator1> traits1;
352    utility::iterator_traits<RandomAccessIterator2> traits2;
353    for (size_t i=0; i<sorted_index.size(); ++i) {
354      size_t si = sorted_index[i];
355      double w = (sum_w + 0.5*traits1.weight(first+si))/total_w;
356      double correction = 0;
357      if (w <= idx(0)) {
358        // linear extrapolation for first part, i.e., use first diff for
359        // all points in the first part.
360        correction = diff(0);
361      }
362      else if (w < idx(idx.size()-1) ) {
363        // cspline interpolation for all data between the mid points of
364        // the first and last part
365        correction = cspline.evaluate(w);
366      }
367      else {
368        // linear extrapolation for last part, i.e., use last diff for
369        // all points in the last part.
370        correction = diff(diff.size()-1);
371      }
372      traits2.data(result+si) = traits1.data(first+si) - correction;
373      sum_w += traits1.weight(first+si);
374    }
375  }
376
377
378  template<typename ForwardIterator>
379  qQuantileNormalizer::Partitioner::Partitioner(ForwardIterator first, 
380                                                ForwardIterator last, 
381                                                unsigned int N)
382    : average_(utility::Vector(N)), quantiles_(utility::Vector(N))
383  {
384    BOOST_CONCEPT_ASSERT((boost::ForwardIterator<ForwardIterator>));
385    build(first, last, N, 
386          typename utility::weighted_iterator_traits<ForwardIterator>::type());
387  }
388
389
390  template<typename ForwardIterator>
391  void qQuantileNormalizer::Partitioner::build(ForwardIterator first, 
392                                               ForwardIterator last, 
393                                               unsigned int N, 
394                                               utility::unweighted_iterator_tag)
395  {
396    utility::Vector vec(std::distance(first, last));
397    std::copy(first, last, vec.begin());
398    std::sort(vec.begin(), vec.end());
399    init(vec, N);
400  }
401
402
403  template<typename ForwardIterator>
404  void qQuantileNormalizer::Partitioner::build(ForwardIterator first, 
405                                               ForwardIterator last, 
406                                               unsigned int N, 
407                                               utility::weighted_iterator_tag)
408  {
409    using namespace utility;
410    std::vector<DataWeight> vec;
411    vec.reserve(std::distance(first, last));
412    std::back_insert_iterator<std::vector<DataWeight> > inserter(vec);
413    std::copy(first, last, inserter);
414    std::sort(vec.begin(), vec.end());
415    init(vec, N);
416  }
417
418
419}}} // end of namespace normalizer, yat and thep
420
421#endif
Note: See TracBrowser for help on using the repository browser.