source: trunk/yat/normalizer/qQuantileNormalizer.h @ 2141

Last change on this file since 2141 was 2141, checked in by Peter, 13 years ago

adding concept Data Iterator. fixes #514

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 14.2 KB
Line 
1#ifndef _theplu_yat_normalizer_qquantile_normalizer_
2#define _theplu_yat_normalizer_qquantile_normalizer_
3
4/*
5  Copyright (C) 2009 Jari Häkkinen, Peter Johansson
6  Copyright (C) 2010 Peter Johansson
7
8  This file is part of the yat library, http://dev.thep.lu.se/yat
9
10  The yat library is free software; you can redistribute it and/or
11  modify it under the terms of the GNU General Public License as
12  published by the Free Software Foundation; either version 3 of the
13  License, or (at your option) any later version.
14
15  The yat library is distributed in the hope that it will be useful,
16  but WITHOUT ANY WARRANTY; without even the implied warranty of
17  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18  General Public License for more details.
19
20  You should have received a copy of the GNU General Public License
21  along with yat. If not, see <http://www.gnu.org/licenses/>.
22*/
23
24#include "yat/regression/CSplineInterpolation.h"
25#include "yat/utility/DataIterator.h"
26#include "yat/utility/DataWeight.h"
27#include "yat/utility/iterator_traits.h"
28#include "yat/utility/sort_index.h"
29#include "yat/utility/Vector.h"
30#include "yat/utility/WeightIterator.h"
31#include "yat/utility/yat_assert.h"
32
33#include <algorithm>
34#include <cmath>
35#include <iterator>
36#include <limits>
37#include <numeric>
38#include <stdexcept>
39#include <vector>
40
41namespace theplu {
42namespace yat {
43namespace utility {
44  class VectorBase;
45}
46namespace normalizer {
47
48  /**
49     \brief Perform Q-quantile normalization
50
51     Perform a Q-quantile normalization on a \a source range, after
52     which it will approximately have the same distribution of data as
53     the \a target range (the Q-quantiles are the same). The rank of
54     an element in the \a source range is not changed.
55
56     The class works also with unweighed ranges, and there is no
57     restriction that weighted \a source range requires weighted \a
58     target range or vice versa.
59
60     Normalization goes like this:
61     - Data are not assumed to be sorted.
62     - Partition sorted \a target data in Q parts. Q must be 3 or larger
63       because of requirements from the underlying cubic spline fit
64     - Calculate the arithmetic (weighted) mean for each part, the mean is
65       assigned to the mid point of each part.
66     - Do the above for the data to be transformed (called \a source
67       here).
68     - For each part, calculate the difference between the \a target
69       and \a the source. Now we have \a Q differences \f$ d_i \f$
70       with associated rank (midpoint of each part).
71     - Create a cubic spline fit to this difference vector \a d. The
72       resulting curve is used to recalculate all values in \a source.
73       - Use the cubic spline fit for values within the cubic spline
74         fit range [midpoint 1st part, midpoint last part].
75       - For data outside the cubic spline fit use linear
76         extrapolation, i.e., a constant shift. \f$ d_{first} \f$ for
77         points below fit range, and \f$ d_{last} \f$ for points above fit
78         range.
79
80     \since New in yat 0.5
81   */
82  class qQuantileNormalizer
83  {
84  public:
85    /**
86       \brief Constructor
87
88       Divides a sorted copy of range [\a first,\a last) into \a Q
89       parts. Parts are divided such that the sum of weights is
90       approximately the same in the different parts. If a relative
91       weight, \f$ w_i / \sum w_i \f$, is larger than 1/Q this might
92       be difficult to achieve, in which case a an exception is
93       thrown. In the unweighted case this implies that \a Q should be
94       smaller (or equal) than number of elements in [\a first, \a
95       last).
96
97       The range supplied to the constructor sets the target
98       distribution.
99
100       As the \a source range is also divided into \a Q parts (when
101       operator() is called), it is recommended to keep \a Q smaller
102       (or equal) than the size of ranges that will be normalized.
103
104       Also, \a Q must not be smaller than 3 due to restrictions in
105       the cubic spline fit utilized in the normalization.
106
107       \b Type \b Requirements:
108       - \c ForwardIterator is a model of \forward_iterator.
109       - \c ForwardIterator is a \ref concept_data_iterator
110    */
111    template<typename ForwardIterator>
112    qQuantileNormalizer(ForwardIterator first, ForwardIterator last,
113                        unsigned int Q);
114
115    /**
116       \brief Perform the Q-quantile normalization.
117
118       Elements in [\a first, \a last) are normalized as described
119       above and the result is assigned to [\a result, \a result + \a
120       last-\a first). Input range [\a first, \a last) is not
121       modified. If ranges are weighted, the weights are copied from
122       [\a first, \a last) to \a result range.
123
124       It is possible to normalize "in place"; it is permissible for
125       \a first and \a result to be the same. However, as assignment
126       occurs sequentially, the operation is undefined if \a result is
127       the same as any in range [\a first, \a last).
128
129       \b Type Requirements:
130       - \c RandomAccessIterator1 is a model of \random_access_iterator
131       - \c RandomAccessIterator1 is a \ref concept_data_iterator
132       - \c RandomAccessIterator1's value type is convertible to
133         \c RandomAccessIterator2's value type
134       - \c RandomAccessIterator2 is a model of \random_access_iterator
135       - \c RandomAccessIterator2 is mutable.
136
137     */
138    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
139    void operator()(RandomAccessIterator1 first, RandomAccessIterator1 last,
140                    RandomAccessIterator2 result) const;
141
142  private:
143
144  /**
145     \brief Partition a range of data into equal sizes.
146
147     Copy the range [first, last), sort the copy, and divide the
148     sorted copy in Q parts. The parts are created such that the total
149     weight in a part is approximately W/Q where W is the total weight
150     (over all parts). The class calculates the average value in each
151     part and also the "quantile".
152  */
153  class Partitioner
154  {
155  public:
156    /**
157       \brief Create the partition and perform required calculations.
158    */
159    template<typename ForwardIterator>
160    Partitioner(ForwardIterator first, ForwardIterator last, 
161                unsigned int N);
162
163    /**
164       \brief Return the averages for each part.
165
166       \return The average vector.
167    */
168    const utility::Vector& averages(void) const;
169
170    /**
171       The quantile (here) is defined as (w_lower + w_upper) / 2W,
172       where w_lower is the total weight of elements smaller than the
173       smallest element in the part, and w_upper is the total weight
174       of elements smaller (or equal) than the largest value in the
175       part. 
176
177       In the unweighted case all weights are 1.0, which implies q_0 =
178       n_0/N, q_1 = (n_0+n_1/2)/N, q_2 = (n_0+n_1+n_2/2)/N where n_i
179       is number of elements in ith part.
180
181       \return The quantiles vector.
182    */
183    const utility::Vector& quantiles(void) const;
184
185    /**
186       \return The number of parts.
187    */
188    size_t size(void) const;
189
190  private:
191    // unweighted "constructor"
192    template<typename ForwardIterator>
193    void build(ForwardIterator first, ForwardIterator last, unsigned int N, 
194               utility::unweighted_iterator_tag);
195    // weighted "constructor"
196    template<typename ForwardIterator>
197    void build(ForwardIterator first, ForwardIterator last, unsigned int N, 
198               utility::weighted_iterator_tag);
199    void init(const utility::VectorBase&, unsigned int N);
200    void init(const std::vector<utility::DataWeight>&, unsigned int N);
201
202    utility::Vector average_;
203    utility::Vector quantiles_;
204  };
205
206    Partitioner target_;
207
208    // unweighted version
209    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
210    void normalize(const Partitioner& source,RandomAccessIterator1 first, 
211                   RandomAccessIterator1 last, RandomAccessIterator2 result,
212                   utility::unweighted_iterator_tag tag) const;
213
214    // weighted version
215    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
216    void normalize(const Partitioner& source,RandomAccessIterator1 first, 
217                   RandomAccessIterator1 last, RandomAccessIterator2 result,
218                   utility::weighted_iterator_tag tag) const;
219  };
220
221
222  // template implementations
223
224  template<typename ForwardIterator>
225  qQuantileNormalizer::qQuantileNormalizer(ForwardIterator first, 
226                                           ForwardIterator last,
227                                           unsigned int Q)
228    : target_(Partitioner(first, last, Q))
229  {
230    utility::yat_assert<std::runtime_error>(Q>2,
231                                            "qQuantileNormalizer: Q too small");
232  }
233
234
235  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
236  void qQuantileNormalizer::operator()(RandomAccessIterator1 first, 
237                                       RandomAccessIterator1 last,
238                                       RandomAccessIterator2 result) const
239  {
240    Partitioner source(first, last, target_.size());
241    typename utility::weighted_iterator_traits<RandomAccessIterator2>::type tag;
242    normalize(source, first, last, result, tag);
243  }
244
245
246  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
247  void 
248  qQuantileNormalizer::normalize(const qQuantileNormalizer::Partitioner& source,
249                                 RandomAccessIterator1 first, 
250                                 RandomAccessIterator1 last, 
251                                 RandomAccessIterator2 result,
252                                 utility::unweighted_iterator_tag tag) const
253  {
254    utility::check_iterator_is_unweighted(first);
255    utility::check_iterator_is_unweighted(result);
256    size_t N = last-first;
257    utility::yat_assert<std::runtime_error>
258      (N >= target_.size(), "qQuantileNormalizer: Input range too small");
259
260    std::vector<size_t> sorted_index(last-first);
261    utility::sort_index(first, last, sorted_index);
262
263    utility::Vector diff(source.averages());
264    diff-=target_.averages();
265    const utility::Vector& idx=target_.quantiles();
266    regression::CSplineInterpolation cspline(idx,diff);
267
268    // linear extrapolation for first part, i.e., use first diff for
269    // all points in the first part.
270    size_t start=0;
271    size_t end = static_cast<size_t>(std::ceil(N*idx(0) - 0.5));
272    // take care of limiting case number of parts approximately equal
273    // to the number of elements in range.
274    if (end==0)
275      ++end;
276    for (size_t i=start; i<end; ++i) {
277      size_t si = sorted_index[i];
278      result[si] = first[si] - diff(0);
279    }
280
281    using utility::yat_assert;
282   
283    // cspline interpolation for all data between the mid points of
284    // the first and last part
285    start=end;
286    end = static_cast<size_t>(std::ceil(N*idx(idx.size()-1) - 0.5));
287    if (end>=N)
288      end = N-1;
289    for ( size_t i=start; i<end; ++i) {
290      size_t si = sorted_index[i];
291     
292      yat_assert<std::runtime_error>((i+0.5)/N>idx(0), 
293                               "qQuantileNormalizer: invalid input to cspline");
294      result[si] = first[si] - cspline.evaluate((i+0.5)/N);
295    }
296   
297    // linear extrapolation for last part, i.e., use last diff for
298    // all points in the last part.
299    for (size_t i=end ; i<N; ++i) {
300      size_t si = sorted_index[i];
301      result[si] = first[si] - diff(diff.size()-1);
302    }
303  }
304
305
306  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
307  void qQuantileNormalizer::normalize(const Partitioner& source,
308                                      RandomAccessIterator1 first, 
309                                      RandomAccessIterator1 last, 
310                                      RandomAccessIterator2 result,
311                                      utility::weighted_iterator_tag tag) const
312  {
313    // copy the weights
314    std::copy(utility::weight_iterator(first), 
315              utility::weight_iterator(last), 
316              utility::weight_iterator(result)); 
317
318    double total_w = std::accumulate(utility::weight_iterator(first),
319                                     utility::weight_iterator(last), 0.0);
320
321    std::vector<size_t> sorted_index(last-first);
322    // Code to avoid problems with NaN (ticket:535)
323    // utility::sort_index(utility::data_iterator(first),
324    //                     utility::data_iterator(last), sorted_index);
325    // ... above statement replaced below code block
326    {
327      using namespace utility;
328      std::vector<double> vec;
329      vec.reserve(std::distance(first, last));
330      std::copy(utility::data_iterator(first), utility::data_iterator(last), 
331                std::back_inserter(vec));
332      for (std::vector<double>::iterator i(vec.begin()); i!=vec.end(); ++i)
333        if (std::isnan(*i))
334          *i = std::numeric_limits<double>::infinity();
335      utility::sort_index(vec.begin(), vec.end(), sorted_index);
336    }
337    // end Code to avoid problems with NaN (ticket:535)
338
339    utility::Vector diff(source.averages());
340    diff-=target_.averages();
341    const utility::Vector& idx=target_.quantiles();
342    regression::CSplineInterpolation cspline(idx,diff);
343
344    double sum_w=0;
345    utility::iterator_traits<RandomAccessIterator1> traits1;
346    utility::iterator_traits<RandomAccessIterator2> traits2;
347    for (size_t i=0; i<sorted_index.size(); ++i) {
348      size_t si = sorted_index[i];
349      double w = (sum_w + 0.5*traits1.weight(first+si))/total_w;
350      double correction = 0;
351      if (w <= idx(0)) {
352        // linear extrapolation for first part, i.e., use first diff for
353        // all points in the first part.
354        correction = diff(0);
355      }
356      else if (w < idx(idx.size()-1) ) {
357        // cspline interpolation for all data between the mid points of
358        // the first and last part
359        correction = cspline.evaluate(w);
360      }
361      else {
362        // linear extrapolation for last part, i.e., use last diff for
363        // all points in the last part.
364        correction = diff(diff.size()-1);
365      }
366      traits2.data(result+si) = traits1.data(first+si) - correction;
367      sum_w += traits1.weight(first+si);
368    }
369  }
370
371
372  template<typename ForwardIterator>
373  qQuantileNormalizer::Partitioner::Partitioner(ForwardIterator first, 
374                                                ForwardIterator last, 
375                                                unsigned int N)
376    : average_(utility::Vector(N)), quantiles_(utility::Vector(N))
377  {
378    build(first, last, N, 
379          typename utility::weighted_iterator_traits<ForwardIterator>::type());
380  }
381
382
383  template<typename ForwardIterator>
384  void qQuantileNormalizer::Partitioner::build(ForwardIterator first, 
385                                               ForwardIterator last, 
386                                               unsigned int N, 
387                                               utility::unweighted_iterator_tag)
388  {
389    utility::Vector vec(std::distance(first, last));
390    std::copy(first, last, vec.begin());
391    std::sort(vec.begin(), vec.end());
392    init(vec, N);
393  }
394
395
396  template<typename ForwardIterator>
397  void qQuantileNormalizer::Partitioner::build(ForwardIterator first, 
398                                               ForwardIterator last, 
399                                               unsigned int N, 
400                                               utility::weighted_iterator_tag)
401  {
402    using namespace utility;
403    std::vector<DataWeight> vec;
404    vec.reserve(std::distance(first, last));
405    std::back_insert_iterator<std::vector<DataWeight> > inserter(vec);
406    std::copy(first, last, inserter);
407    std::sort(vec.begin(), vec.end());
408    init(vec, N);
409  }
410
411
412}}} // end of namespace normalizer, yat and thep
413
414#endif
Note: See TracBrowser for help on using the repository browser.