source: trunk/yat/normalizer/qQuantileNormalizer.h @ 2154

Last change on this file since 2154 was 2154, checked in by Peter, 13 years ago

using BOOST_CONCEPT_ASSERT. closes #603

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 14.6 KB
Line 
1#ifndef _theplu_yat_normalizer_qquantile_normalizer_
2#define _theplu_yat_normalizer_qquantile_normalizer_
3
4/*
5  Copyright (C) 2009 Jari Häkkinen, Peter Johansson
6  Copyright (C) 2010 Peter Johansson
7
8  This file is part of the yat library, http://dev.thep.lu.se/yat
9
10  The yat library is free software; you can redistribute it and/or
11  modify it under the terms of the GNU General Public License as
12  published by the Free Software Foundation; either version 3 of the
13  License, or (at your option) any later version.
14
15  The yat library is distributed in the hope that it will be useful,
16  but WITHOUT ANY WARRANTY; without even the implied warranty of
17  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18  General Public License for more details.
19
20  You should have received a copy of the GNU General Public License
21  along with yat. If not, see <http://www.gnu.org/licenses/>.
22*/
23
24#include "yat/regression/CSplineInterpolation.h"
25#include "yat/utility/DataIterator.h"
26#include "yat/utility/DataWeight.h"
27#include "yat/utility/iterator_traits.h"
28#include "yat/utility/sort_index.h"
29#include "yat/utility/Vector.h"
30#include "yat/utility/WeightIterator.h"
31#include "yat/utility/yat_assert.h"
32
33#include <boost/concept_check.hpp>
34
35#include <algorithm>
36#include <cmath>
37#include <iterator>
38#include <limits>
39#include <numeric>
40#include <stdexcept>
41#include <vector>
42
43namespace theplu {
44namespace yat {
45namespace utility {
46  class VectorBase;
47}
48namespace normalizer {
49
50  /**
51     \brief Perform Q-quantile normalization
52
53     Perform a Q-quantile normalization on a \a source range, after
54     which it will approximately have the same distribution of data as
55     the \a target range (the Q-quantiles are the same). The rank of
56     an element in the \a source range is not changed.
57
58     The class works also with unweighed ranges, and there is no
59     restriction that weighted \a source range requires weighted \a
60     target range or vice versa.
61
62     Normalization goes like this:
63     - Data are not assumed to be sorted.
64     - Partition sorted \a target data in Q parts. Q must be 3 or larger
65       because of requirements from the underlying cubic spline fit
66     - Calculate the arithmetic (weighted) mean for each part, the mean is
67       assigned to the mid point of each part.
68     - Do the above for the data to be transformed (called \a source
69       here).
70     - For each part, calculate the difference between the \a target
71       and \a the source. Now we have \a Q differences \f$ d_i \f$
72       with associated rank (midpoint of each part).
73     - Create a cubic spline fit to this difference vector \a d. The
74       resulting curve is used to recalculate all values in \a source.
75       - Use the cubic spline fit for values within the cubic spline
76         fit range [midpoint 1st part, midpoint last part].
77       - For data outside the cubic spline fit use linear
78         extrapolation, i.e., a constant shift. \f$ d_{first} \f$ for
79         points below fit range, and \f$ d_{last} \f$ for points above fit
80         range.
81
82     \since New in yat 0.5
83   */
84  class qQuantileNormalizer
85  {
86  public:
87    /**
88       \brief Constructor
89
90       Divides a sorted copy of range [\a first,\a last) into \a Q
91       parts. Parts are divided such that the sum of weights is
92       approximately the same in the different parts. If a relative
93       weight, \f$ w_i / \sum w_i \f$, is larger than 1/Q this might
94       be difficult to achieve, in which case a an exception is
95       thrown. In the unweighted case this implies that \a Q should be
96       smaller (or equal) than number of elements in [\a first, \a
97       last).
98
99       The range supplied to the constructor sets the target
100       distribution.
101
102       As the \a source range is also divided into \a Q parts (when
103       operator() is called), it is recommended to keep \a Q smaller
104       (or equal) than the size of ranges that will be normalized.
105
106       Also, \a Q must not be smaller than 3 due to restrictions in
107       the cubic spline fit utilized in the normalization.
108
109       \b Type \b Requirements:
110       - \c ForwardIterator is a model of \forward_iterator.
111       - \c ForwardIterator is a \ref concept_data_iterator
112    */
113    template<typename ForwardIterator>
114    qQuantileNormalizer(ForwardIterator first, ForwardIterator last,
115                        unsigned int Q);
116
117    /**
118       \brief Perform the Q-quantile normalization.
119
120       Elements in [\a first, \a last) are normalized as described
121       above and the result is assigned to [\a result, \a result + \a
122       last-\a first). Input range [\a first, \a last) is not
123       modified. If ranges are weighted, the weights are copied from
124       [\a first, \a last) to \a result range.
125
126       It is possible to normalize "in place"; it is permissible for
127       \a first and \a result to be the same. However, as assignment
128       occurs sequentially, the operation is undefined if \a result is
129       the same as any in range [\a first, \a last).
130
131       \b Type Requirements:
132       - \c RandomAccessIterator1 is a model of \random_access_iterator
133       - \c RandomAccessIterator1 is a \ref concept_data_iterator
134       - \c RandomAccessIterator1's value type is convertible to
135         \c RandomAccessIterator2's value type
136       - \c RandomAccessIterator2 is a model of \random_access_iterator
137       - \c RandomAccessIterator2 is mutable.
138
139     */
140    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
141    void operator()(RandomAccessIterator1 first, RandomAccessIterator1 last,
142                    RandomAccessIterator2 result) const;
143
144  private:
145
146  /**
147     \brief Partition a range of data into equal sizes.
148
149     Copy the range [first, last), sort the copy, and divide the
150     sorted copy in Q parts. The parts are created such that the total
151     weight in a part is approximately W/Q where W is the total weight
152     (over all parts). The class calculates the average value in each
153     part and also the "quantile".
154  */
155  class Partitioner
156  {
157  public:
158    /**
159       \brief Create the partition and perform required calculations.
160    */
161    template<typename ForwardIterator>
162    Partitioner(ForwardIterator first, ForwardIterator last, 
163                unsigned int N);
164
165    /**
166       \brief Return the averages for each part.
167
168       \return The average vector.
169    */
170    const utility::Vector& averages(void) const;
171
172    /**
173       The quantile (here) is defined as (w_lower + w_upper) / 2W,
174       where w_lower is the total weight of elements smaller than the
175       smallest element in the part, and w_upper is the total weight
176       of elements smaller (or equal) than the largest value in the
177       part. 
178
179       In the unweighted case all weights are 1.0, which implies q_0 =
180       n_0/N, q_1 = (n_0+n_1/2)/N, q_2 = (n_0+n_1+n_2/2)/N where n_i
181       is number of elements in ith part.
182
183       \return The quantiles vector.
184    */
185    const utility::Vector& quantiles(void) const;
186
187    /**
188       \return The number of parts.
189    */
190    size_t size(void) const;
191
192  private:
193    // unweighted "constructor"
194    template<typename ForwardIterator>
195    void build(ForwardIterator first, ForwardIterator last, unsigned int N, 
196               utility::unweighted_iterator_tag);
197    // weighted "constructor"
198    template<typename ForwardIterator>
199    void build(ForwardIterator first, ForwardIterator last, unsigned int N, 
200               utility::weighted_iterator_tag);
201    void init(const utility::VectorBase&, unsigned int N);
202    void init(const std::vector<utility::DataWeight>&, unsigned int N);
203
204    utility::Vector average_;
205    utility::Vector quantiles_;
206  };
207
208    Partitioner target_;
209
210    // unweighted version
211    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
212    void normalize(const Partitioner& source,RandomAccessIterator1 first, 
213                   RandomAccessIterator1 last, RandomAccessIterator2 result,
214                   utility::unweighted_iterator_tag tag) const;
215
216    // weighted version
217    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
218    void normalize(const Partitioner& source,RandomAccessIterator1 first, 
219                   RandomAccessIterator1 last, RandomAccessIterator2 result,
220                   utility::weighted_iterator_tag tag) const;
221  };
222
223
224  // template implementations
225
226  template<typename ForwardIterator>
227  qQuantileNormalizer::qQuantileNormalizer(ForwardIterator first, 
228                                           ForwardIterator last,
229                                           unsigned int Q)
230    : target_(Partitioner(first, last, Q))
231  {
232    utility::yat_assert<std::runtime_error>(Q>2,
233                                            "qQuantileNormalizer: Q too small");
234  }
235
236
237  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
238  void qQuantileNormalizer::operator()(RandomAccessIterator1 first, 
239                                       RandomAccessIterator1 last,
240                                       RandomAccessIterator2 result) const
241  {
242    Partitioner source(first, last, target_.size());
243    typename utility::weighted_iterator_traits<RandomAccessIterator2>::type tag;
244    normalize(source, first, last, result, tag);
245  }
246
247
248  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
249  void 
250  qQuantileNormalizer::normalize(const qQuantileNormalizer::Partitioner& source,
251                                 RandomAccessIterator1 first, 
252                                 RandomAccessIterator1 last, 
253                                 RandomAccessIterator2 result,
254                                 utility::unweighted_iterator_tag tag) const
255  {
256    BOOST_CONCEPT_ASSERT((boost::RandomAccessIterator<RandomAccessIterator1>));
257    BOOST_CONCEPT_ASSERT((boost::Mutable_RandomAccessIterator<RandomAccessIterator2>));
258    utility::check_iterator_is_unweighted(first);
259    utility::check_iterator_is_unweighted(result);
260    size_t N = last-first;
261    utility::yat_assert<std::runtime_error>
262      (N >= target_.size(), "qQuantileNormalizer: Input range too small");
263
264    std::vector<size_t> sorted_index(last-first);
265    utility::sort_index(first, last, sorted_index);
266
267    utility::Vector diff(source.averages());
268    diff-=target_.averages();
269    const utility::Vector& idx=target_.quantiles();
270    regression::CSplineInterpolation cspline(idx,diff);
271
272    // linear extrapolation for first part, i.e., use first diff for
273    // all points in the first part.
274    size_t start=0;
275    size_t end = static_cast<size_t>(std::ceil(N*idx(0) - 0.5));
276    // take care of limiting case number of parts approximately equal
277    // to the number of elements in range.
278    if (end==0)
279      ++end;
280    for (size_t i=start; i<end; ++i) {
281      size_t si = sorted_index[i];
282      result[si] = first[si] - diff(0);
283    }
284
285    using utility::yat_assert;
286   
287    // cspline interpolation for all data between the mid points of
288    // the first and last part
289    start=end;
290    end = static_cast<size_t>(std::ceil(N*idx(idx.size()-1) - 0.5));
291    if (end>=N)
292      end = N-1;
293    for ( size_t i=start; i<end; ++i) {
294      size_t si = sorted_index[i];
295     
296      yat_assert<std::runtime_error>((i+0.5)/N>idx(0), 
297                               "qQuantileNormalizer: invalid input to cspline");
298      result[si] = first[si] - cspline.evaluate((i+0.5)/N);
299    }
300   
301    // linear extrapolation for last part, i.e., use last diff for
302    // all points in the last part.
303    for (size_t i=end ; i<N; ++i) {
304      size_t si = sorted_index[i];
305      result[si] = first[si] - diff(diff.size()-1);
306    }
307  }
308
309
310  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
311  void qQuantileNormalizer::normalize(const Partitioner& source,
312                                      RandomAccessIterator1 first, 
313                                      RandomAccessIterator1 last, 
314                                      RandomAccessIterator2 result,
315                                      utility::weighted_iterator_tag tag) const
316  {
317    BOOST_CONCEPT_ASSERT((boost::RandomAccessIterator<RandomAccessIterator1>));
318    BOOST_CONCEPT_ASSERT((boost::Mutable_RandomAccessIterator<RandomAccessIterator2>));
319    // copy the weights
320    std::copy(utility::weight_iterator(first), 
321              utility::weight_iterator(last), 
322              utility::weight_iterator(result)); 
323
324    double total_w = std::accumulate(utility::weight_iterator(first),
325                                     utility::weight_iterator(last), 0.0);
326
327    std::vector<size_t> sorted_index(last-first);
328    // Code to avoid problems with NaN (ticket:535)
329    // utility::sort_index(utility::data_iterator(first),
330    //                     utility::data_iterator(last), sorted_index);
331    // ... above statement replaced below code block
332    {
333      using namespace utility;
334      std::vector<double> vec;
335      vec.reserve(std::distance(first, last));
336      std::copy(utility::data_iterator(first), utility::data_iterator(last), 
337                std::back_inserter(vec));
338      for (std::vector<double>::iterator i(vec.begin()); i!=vec.end(); ++i)
339        if (std::isnan(*i))
340          *i = std::numeric_limits<double>::infinity();
341      utility::sort_index(vec.begin(), vec.end(), sorted_index);
342    }
343    // end Code to avoid problems with NaN (ticket:535)
344
345    utility::Vector diff(source.averages());
346    diff-=target_.averages();
347    const utility::Vector& idx=target_.quantiles();
348    regression::CSplineInterpolation cspline(idx,diff);
349
350    double sum_w=0;
351    utility::iterator_traits<RandomAccessIterator1> traits1;
352    utility::iterator_traits<RandomAccessIterator2> traits2;
353    for (size_t i=0; i<sorted_index.size(); ++i) {
354      size_t si = sorted_index[i];
355      double w = (sum_w + 0.5*traits1.weight(first+si))/total_w;
356      double correction = 0;
357      if (w <= idx(0)) {
358        // linear extrapolation for first part, i.e., use first diff for
359        // all points in the first part.
360        correction = diff(0);
361      }
362      else if (w < idx(idx.size()-1) ) {
363        // cspline interpolation for all data between the mid points of
364        // the first and last part
365        correction = cspline.evaluate(w);
366      }
367      else {
368        // linear extrapolation for last part, i.e., use last diff for
369        // all points in the last part.
370        correction = diff(diff.size()-1);
371      }
372      traits2.data(result+si) = traits1.data(first+si) - correction;
373      sum_w += traits1.weight(first+si);
374    }
375  }
376
377
378  template<typename ForwardIterator>
379  qQuantileNormalizer::Partitioner::Partitioner(ForwardIterator first, 
380                                                ForwardIterator last, 
381                                                unsigned int N)
382    : average_(utility::Vector(N)), quantiles_(utility::Vector(N))
383  {
384    BOOST_CONCEPT_ASSERT((boost::ForwardIterator<ForwardIterator>));
385    build(first, last, N, 
386          typename utility::weighted_iterator_traits<ForwardIterator>::type());
387  }
388
389
390  template<typename ForwardIterator>
391  void qQuantileNormalizer::Partitioner::build(ForwardIterator first, 
392                                               ForwardIterator last, 
393                                               unsigned int N, 
394                                               utility::unweighted_iterator_tag)
395  {
396    utility::Vector vec(std::distance(first, last));
397    std::copy(first, last, vec.begin());
398    std::sort(vec.begin(), vec.end());
399    init(vec, N);
400  }
401
402
403  template<typename ForwardIterator>
404  void qQuantileNormalizer::Partitioner::build(ForwardIterator first, 
405                                               ForwardIterator last, 
406                                               unsigned int N, 
407                                               utility::weighted_iterator_tag)
408  {
409    using namespace utility;
410    std::vector<DataWeight> vec;
411    vec.reserve(std::distance(first, last));
412    std::back_insert_iterator<std::vector<DataWeight> > inserter(vec);
413    std::copy(first, last, inserter);
414    std::sort(vec.begin(), vec.end());
415    init(vec, N);
416  }
417
418
419}}} // end of namespace normalizer, yat and thep
420
421#endif
Note: See TracBrowser for help on using the repository browser.