source: trunk/yat/normalizer/qQuantileNormalizer.h @ 2119

Last change on this file since 2119 was 2119, checked in by Peter, 13 years ago

converted files to utf-8. fixes #577

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 14.4 KB
Line 
1#ifndef _theplu_yat_normalizer_qquantile_normalizer_
2#define _theplu_yat_normalizer_qquantile_normalizer_
3
4/*
5  Copyright (C) 2009 Jari Häkkinen, Peter Johansson
6
7  This file is part of the yat library, http://dev.thep.lu.se/yat
8
9  The yat library is free software; you can redistribute it and/or
10  modify it under the terms of the GNU General Public License as
11  published by the Free Software Foundation; either version 3 of the
12  License, or (at your option) any later version.
13
14  The yat library is distributed in the hope that it will be useful,
15  but WITHOUT ANY WARRANTY; without even the implied warranty of
16  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  General Public License for more details.
18
19  You should have received a copy of the GNU General Public License
20  along with yat. If not, see <http://www.gnu.org/licenses/>.
21*/
22
23#include "yat/regression/CSplineInterpolation.h"
24#include "yat/utility/DataIterator.h"
25#include "yat/utility/DataWeight.h"
26#include "yat/utility/iterator_traits.h"
27#include "yat/utility/sort_index.h"
28#include "yat/utility/Vector.h"
29#include "yat/utility/WeightIterator.h"
30#include "yat/utility/yat_assert.h"
31
32#include <algorithm>
33#include <cmath>
34#include <iterator>
35#include <limits>
36#include <numeric>
37#include <stdexcept>
38#include <vector>
39
40namespace theplu {
41namespace yat {
42namespace utility {
43  class VectorBase;
44}
45namespace normalizer {
46
47  /**
48     \brief Perform Q-quantile normalization
49
50     Perform a Q-quantile normalization on a \a source range, after
51     which it will approximately have the same distribution of data as
52     the \a target range (the Q-quantiles are the same). The rank of
53     an element in the \a source range is not changed.
54
55     The class works also with unweighed ranges, and there is no
56     restriction that weighted \a source range requires weighted \a
57     target range or vice versa.
58
59     Normalization goes like this:
60     - Data are not assumed to be sorted.
61     - Partition sorted \a target data in Q parts. Q must be 3 or larger
62       because of requirements from the underlying cubic spline fit
63     - Calculate the arithmetic (weighted) mean for each part, the mean is
64       assigned to the mid point of each part.
65     - Do the above for the data to be transformed (called \a source
66       here).
67     - For each part, calculate the difference between the \a target
68       and \a the source. Now we have \a Q differences \f$ d_i \f$
69       with associated rank (midpoint of each part).
70     - Create a cubic spline fit to this difference vector \a d. The
71       resulting curve is used to recalculate all values in \a source.
72       - Use the cubic spline fit for values within the cubic spline
73         fit range [midpoint 1st part, midpoint last part].
74       - For data outside the cubic spline fit use linear
75         extrapolation, i.e., a constant shift. \f$ d_{first} \f$ for
76         points below fit range, and \f$ d_{last} \f$ for points above fit
77         range.
78
79     \since New in yat 0.5
80   */
81  class qQuantileNormalizer
82  {
83  public:
84    /**
85       \brief Constructor
86
87       Divides a sorted copy of range [\a first,\a last) into \a Q
88       parts. Parts are divided such that the sum of weights is
89       approximately the same in the different parts. If a relative
90       weight, \f$ w_i / \sum w_i \f$, is larger than 1/Q this might
91       be difficult to achieve, in which case a an exception is
92       thrown. In the unweighted case this implies that \a Q should be
93       smaller (or equal) than number of elements in [\a first, \a
94       last).
95
96       The range supplied to the constructor sets the target
97       distribution.
98
99       As the \a source range is also divided into \a Q parts (when
100       operator() is called), it is recommended to keep \a Q smaller
101       (or equal) than the size of ranges that will be normalized.
102
103       Also, \a Q must not be smaller than 3 due to restrictions in
104       the cubic spline fit utilized in the normalization.
105
106       \b Type \b Requirements:
107       - \c ForwardIterator is a model of \forward_iterator.
108       - \c ForwardIterator is either a \ref concept_weighted_iterator
109         or unweighted with value type convertible to \c double.
110
111    */
112    template<typename ForwardIterator>
113    qQuantileNormalizer(ForwardIterator first, ForwardIterator last,
114                        unsigned int Q);
115
116    /**
117       \brief Perform the Q-quantile normalization.
118
119       Elements in [\a first, \a last) are normalized as described
120       above and the result is assigned to [\a result, \a result + \a
121       last-\a first). Input range [\a first, \a last) is not
122       modified. If ranges are weighted, the weights are copied from
123       [\a first, \a last) to \a result range.
124
125       It is possible to normalize "in place"; it is permissible for
126       \a first and \a result to be the same. However, as assignment
127       occurs sequentially, the operation is undefined if \a result is
128       the same as any in range [\a first, \a last).
129
130       \b Type Requirements:
131       - \c RandomAccessIterator1 is a model of \random_access_iterator
132       - \c RandomAccessIterator1 is either a \ref
133         concept_weighted_iterator or unweighted with value type
134         convertible to \c double.
135       - \c RandomAccessIterator1's value type is convertible to
136         \c RandomAccessIterator2's value type
137       - \c RandomAccessIterator2 is a model of \random_access_iterator
138       - \c RandomAccessIterator2 is mutable.
139
140     */
141    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
142    void operator()(RandomAccessIterator1 first, RandomAccessIterator1 last,
143                    RandomAccessIterator2 result) const;
144
145  private:
146
147  /**
148     \brief Partition a range of data into equal sizes.
149
150     Copy the range [first, last), sort the copy, and divide the
151     sorted copy in Q parts. The parts are created such that the total
152     weight in a part is approximately W/Q where W is the total weight
153     (over all parts). The class calculates the average value in each
154     part and also the "quantile".
155  */
156  class Partitioner
157  {
158  public:
159    /**
160       \brief Create the partition and perform required calculations.
161    */
162    template<typename ForwardIterator>
163    Partitioner(ForwardIterator first, ForwardIterator last, 
164                unsigned int N);
165
166    /**
167       \brief Return the averages for each part.
168
169       \return The average vector.
170    */
171    const utility::Vector& averages(void) const;
172
173    /**
174       The quantile (here) is defined as (w_lower + w_upper) / 2W,
175       where w_lower is the total weight of elements smaller than the
176       smallest element in the part, and w_upper is the total weight
177       of elements smaller (or equal) than the largest value in the
178       part. 
179
180       In the unweighted case all weights are 1.0, which implies q_0 =
181       n_0/N, q_1 = (n_0+n_1/2)/N, q_2 = (n_0+n_1+n_2/2)/N where n_i
182       is number of elements in ith part.
183
184       \return The quantiles vector.
185    */
186    const utility::Vector& quantiles(void) const;
187
188    /**
189       \return The number of parts.
190    */
191    size_t size(void) const;
192
193  private:
194    // unweighted "constructor"
195    template<typename ForwardIterator>
196    void build(ForwardIterator first, ForwardIterator last, unsigned int N, 
197               utility::unweighted_iterator_tag);
198    // weighted "constructor"
199    template<typename ForwardIterator>
200    void build(ForwardIterator first, ForwardIterator last, unsigned int N, 
201               utility::weighted_iterator_tag);
202    void init(const utility::VectorBase&, unsigned int N);
203    void init(const std::vector<utility::DataWeight>&, unsigned int N);
204
205    utility::Vector average_;
206    utility::Vector quantiles_;
207  };
208
209    Partitioner target_;
210
211    // unweighted version
212    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
213    void normalize(const Partitioner& source,RandomAccessIterator1 first, 
214                   RandomAccessIterator1 last, RandomAccessIterator2 result,
215                   utility::unweighted_iterator_tag tag) const;
216
217    // weighted version
218    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
219    void normalize(const Partitioner& source,RandomAccessIterator1 first, 
220                   RandomAccessIterator1 last, RandomAccessIterator2 result,
221                   utility::weighted_iterator_tag tag) const;
222  };
223
224
225  // template implementations
226
227  template<typename ForwardIterator>
228  qQuantileNormalizer::qQuantileNormalizer(ForwardIterator first, 
229                                           ForwardIterator last,
230                                           unsigned int Q)
231    : target_(Partitioner(first, last, Q))
232  {
233    utility::yat_assert<std::runtime_error>(Q>2,
234                                            "qQuantileNormalizer: Q too small");
235  }
236
237
238  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
239  void qQuantileNormalizer::operator()(RandomAccessIterator1 first, 
240                                       RandomAccessIterator1 last,
241                                       RandomAccessIterator2 result) const
242  {
243    Partitioner source(first, last, target_.size());
244    typename utility::weighted_iterator_traits<RandomAccessIterator2>::type tag;
245    normalize(source, first, last, result, tag);
246  }
247
248
249  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
250  void 
251  qQuantileNormalizer::normalize(const qQuantileNormalizer::Partitioner& source,
252                                 RandomAccessIterator1 first, 
253                                 RandomAccessIterator1 last, 
254                                 RandomAccessIterator2 result,
255                                 utility::unweighted_iterator_tag tag) const
256  {
257    utility::check_iterator_is_unweighted(first);
258    utility::check_iterator_is_unweighted(result);
259    size_t N = last-first;
260    utility::yat_assert<std::runtime_error>
261      (N >= target_.size(), "qQuantileNormalizer: Input range too small");
262
263    std::vector<size_t> sorted_index(last-first);
264    utility::sort_index(first, last, sorted_index);
265
266    utility::Vector diff(source.averages());
267    diff-=target_.averages();
268    const utility::Vector& idx=target_.quantiles();
269    regression::CSplineInterpolation cspline(idx,diff);
270
271    // linear extrapolation for first part, i.e., use first diff for
272    // all points in the first part.
273    size_t start=0;
274    size_t end = static_cast<size_t>(std::ceil(N*idx(0) - 0.5));
275    // take care of limiting case number of parts approximately equal
276    // to the number of elements in range.
277    if (end==0)
278      ++end;
279    for (size_t i=start; i<end; ++i) {
280      size_t si = sorted_index[i];
281      result[si] = first[si] - diff(0);
282    }
283
284    using utility::yat_assert;
285   
286    // cspline interpolation for all data between the mid points of
287    // the first and last part
288    start=end;
289    end = static_cast<size_t>(std::ceil(N*idx(idx.size()-1) - 0.5));
290    if (end>=N)
291      end = N-1;
292    for ( size_t i=start; i<end; ++i) {
293      size_t si = sorted_index[i];
294     
295      yat_assert<std::runtime_error>((i+0.5)/N>idx(0), 
296                               "qQuantileNormalizer: invalid input to cspline");
297      result[si] = first[si] - cspline.evaluate((i+0.5)/N);
298    }
299   
300    // linear extrapolation for last part, i.e., use last diff for
301    // all points in the last part.
302    for (size_t i=end ; i<N; ++i) {
303      size_t si = sorted_index[i];
304      result[si] = first[si] - diff(diff.size()-1);
305    }
306  }
307
308
309  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
310  void qQuantileNormalizer::normalize(const Partitioner& source,
311                                      RandomAccessIterator1 first, 
312                                      RandomAccessIterator1 last, 
313                                      RandomAccessIterator2 result,
314                                      utility::weighted_iterator_tag tag) const
315  {
316    // copy the weights
317    std::copy(utility::weight_iterator(first), 
318              utility::weight_iterator(last), 
319              utility::weight_iterator(result)); 
320
321    double total_w = std::accumulate(utility::weight_iterator(first),
322                                     utility::weight_iterator(last), 0.0);
323
324    std::vector<size_t> sorted_index(last-first);
325    // Code to avoid problems with NaN (ticket:535)
326    // utility::sort_index(utility::data_iterator(first),
327    //                     utility::data_iterator(last), sorted_index);
328    // ... above statement replaced below code block
329    {
330      using namespace utility;
331      std::vector<double> vec;
332      vec.reserve(std::distance(first, last));
333      std::copy(utility::data_iterator(first), utility::data_iterator(last), 
334                std::back_inserter(vec));
335      for (std::vector<double>::iterator i(vec.begin()); i!=vec.end(); ++i)
336        if (std::isnan(*i))
337          *i = std::numeric_limits<double>::infinity();
338      utility::sort_index(vec.begin(), vec.end(), sorted_index);
339    }
340    // end Code to avoid problems with NaN (ticket:535)
341
342    utility::Vector diff(source.averages());
343    diff-=target_.averages();
344    const utility::Vector& idx=target_.quantiles();
345    regression::CSplineInterpolation cspline(idx,diff);
346
347    double sum_w=0;
348    utility::iterator_traits<RandomAccessIterator1> traits1;
349    utility::iterator_traits<RandomAccessIterator2> traits2;
350    for (size_t i=0; i<sorted_index.size(); ++i) {
351      size_t si = sorted_index[i];
352      double w = (sum_w + 0.5*traits1.weight(first+si))/total_w;
353      double correction = 0;
354      if (w <= idx(0)) {
355        // linear extrapolation for first part, i.e., use first diff for
356        // all points in the first part.
357        correction = diff(0);
358      }
359      else if (w < idx(idx.size()-1) ) {
360        // cspline interpolation for all data between the mid points of
361        // the first and last part
362        correction = cspline.evaluate(w);
363      }
364      else {
365        // linear extrapolation for last part, i.e., use last diff for
366        // all points in the last part.
367        correction = diff(diff.size()-1);
368      }
369      traits2.data(result+si) = traits1.data(first+si) - correction;
370      sum_w += traits1.weight(first+si);
371    }
372  }
373
374
375  template<typename ForwardIterator>
376  qQuantileNormalizer::Partitioner::Partitioner(ForwardIterator first, 
377                                                ForwardIterator last, 
378                                                unsigned int N)
379    : average_(utility::Vector(N)), quantiles_(utility::Vector(N))
380  {
381    build(first, last, N, 
382          typename utility::weighted_iterator_traits<ForwardIterator>::type());
383  }
384
385
386  template<typename ForwardIterator>
387  void qQuantileNormalizer::Partitioner::build(ForwardIterator first, 
388                                               ForwardIterator last, 
389                                               unsigned int N, 
390                                               utility::unweighted_iterator_tag)
391  {
392    utility::Vector vec(std::distance(first, last));
393    std::copy(first, last, vec.begin());
394    std::sort(vec.begin(), vec.end());
395    init(vec, N);
396  }
397
398
399  template<typename ForwardIterator>
400  void qQuantileNormalizer::Partitioner::build(ForwardIterator first, 
401                                               ForwardIterator last, 
402                                               unsigned int N, 
403                                               utility::weighted_iterator_tag)
404  {
405    using namespace utility;
406    std::vector<DataWeight> vec;
407    vec.reserve(std::distance(first, last));
408    std::back_insert_iterator<std::vector<DataWeight> > inserter(vec);
409    std::copy(first, last, inserter);
410    std::sort(vec.begin(), vec.end());
411    init(vec, N);
412  }
413
414
415}}} // end of namespace normalizer, yat and thep
416
417#endif
Note: See TracBrowser for help on using the repository browser.