source: trunk/yat/normalizer/qQuantileNormalizer.h @ 3546

Last change on this file since 3546 was 3546, checked in by Peter, 6 years ago

prefer using constructor rather than empty construction + std::copy

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 14.6 KB
Line 
1#ifndef _theplu_yat_normalizer_qquantile_normalizer_
2#define _theplu_yat_normalizer_qquantile_normalizer_
3
4/*
5  Copyright (C) 2009 Jari Häkkinen, Peter Johansson
6  Copyright (C) 2010 Peter Johansson
7
8  This file is part of the yat library, http://dev.thep.lu.se/yat
9
10  The yat library is free software; you can redistribute it and/or
11  modify it under the terms of the GNU General Public License as
12  published by the Free Software Foundation; either version 3 of the
13  License, or (at your option) any later version.
14
15  The yat library is distributed in the hope that it will be useful,
16  but WITHOUT ANY WARRANTY; without even the implied warranty of
17  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18  General Public License for more details.
19
20  You should have received a copy of the GNU General Public License
21  along with yat. If not, see <http://www.gnu.org/licenses/>.
22*/
23
24#include "utility.h"
25
26#include "yat/regression/CSplineInterpolation.h"
27#include "yat/utility/concept_check.h"
28#include "yat/utility/DataIterator.h"
29#include "yat/utility/DataWeight.h"
30#include "yat/utility/Exception.h"
31#include "yat/utility/iterator_traits.h"
32#include "yat/utility/sort_index.h"
33#include "yat/utility/Vector.h"
34#include "yat/utility/WeightIterator.h"
35#include "yat/utility/yat_assert.h"
36
37#include <boost/concept_check.hpp>
38
39#include <algorithm>
40#include <cmath>
41#include <iterator>
42#include <limits>
43#include <numeric>
44#include <stdexcept>
45#include <vector>
46
47namespace theplu {
48namespace yat {
49namespace utility {
50  class VectorBase;
51}
52namespace normalizer {
53
  /**
     \brief Perform Q-quantile normalization

     Perform a Q-quantile normalization on a \a source range, after
     which it will approximately have the same distribution of data as
     the \a target range (the Q-quantiles are the same). The rank of
     an element in the \a source range is not changed.

     The class works also with unweighted ranges, and there is no
     restriction that a weighted \a source range requires a weighted \a
     target range or vice versa.

     Normalization goes like this:
     - Data are not assumed to be sorted.
     - Partition sorted \a target data in Q parts. Q must be 3 or larger
       because of requirements from the underlying cubic spline fit
     - Calculate the arithmetic (weighted) mean for each part, the mean is
       assigned to the mid point of each part.
     - Do the above for the data to be transformed (called \a source
       here).
     - For each part, calculate the difference between the \a target
       and the \a source. Now we have \a Q differences \f$ d_i \f$
       with associated rank (midpoint of each part).
     - Create a cubic spline fit to this difference vector \a d. The
       resulting curve is used to recalculate all values in \a source.
       - Use the cubic spline fit for values within the cubic spline
         fit range [midpoint 1st part, midpoint last part].
       - For data outside the cubic spline fit use linear
         extrapolation, i.e., a constant shift. \f$ d_{first} \f$ for
         points below fit range, and \f$ d_{last} \f$ for points above fit
         range.

     \since New in yat 0.5
   */
  class qQuantileNormalizer
  {
  public:
    /**
       \brief Constructor

       Divides a sorted copy of range [\a first,\a last) into \a Q
       parts. Parts are divided such that the sum of weights is
       approximately the same in the different parts. If a relative
       weight, \f$ w_i / \sum w_i \f$, is larger than 1/Q this might
       be difficult to achieve, in which case an exception is
       thrown. In the unweighted case this implies that \a Q should be
       smaller (or equal) than number of elements in [\a first, \a
       last).

       The range supplied to the constructor sets the target
       distribution.

       As the \a source range is also divided into \a Q parts (when
       operator() is called), it is recommended to keep \a Q smaller
       (or equal) than the size of ranges that will be normalized.

       Also, \a Q must not be smaller than 3 due to restrictions in
       the cubic spline fit utilized in the normalization.

       \b Type \b Requirements:
       - \c ForwardIterator is a model of \forward_iterator.
       - \c ForwardIterator is a \ref concept_data_iterator
    */
    template<typename ForwardIterator>
    qQuantileNormalizer(ForwardIterator first, ForwardIterator last,
                        unsigned int Q);

    /**
       \brief Perform the Q-quantile normalization.

       Elements in [\a first, \a last) are normalized as described
       above and the result is assigned to [\a result, \a result + \a
       last-\a first). Input range [\a first, \a last) is not
       modified. If ranges are weighted, the weights are copied from
       [\a first, \a last) to \a result range.

       It is possible to normalize "in place"; it is permissible for
       \a first and \a result to be the same. However, as assignment
       occurs sequentially, the operation is undefined if \a result is
       the same as any in range [\a first, \a last).

       \b Type Requirements:
       - \c RandomAccessIterator1 is a model of \random_access_iterator
       - \c RandomAccessIterator1 is a \ref concept_data_iterator
       - \c RandomAccessIterator1's value type is convertible to
         \c RandomAccessIterator2's value type
       - \c RandomAccessIterator2 is a model of \random_access_iterator
       - \c RandomAccessIterator2 is mutable.

     */
    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
    void operator()(RandomAccessIterator1 first, RandomAccessIterator1 last,
                    RandomAccessIterator2 result) const;

  private:

  /**
     \brief Partition a range of data into equal sizes.

     Copy the range [first, last), sort the copy, and divide the
     sorted copy in Q parts. The parts are created such that the total
     weight in a part is approximately W/Q where W is the total weight
     (over all parts). The class calculates the average value in each
     part and also the "quantile".
  */
  class Partitioner
  {
  public:
    /**
       \brief Create the partition and perform required calculations.
    */
    template<typename ForwardIterator>
    Partitioner(ForwardIterator first, ForwardIterator last,
                unsigned int N);

    /**
       \brief Return the averages for each part.

       \return The average vector.
    */
    const utility::Vector& averages(void) const;

    /**
       The quantile (here) is defined as (w_lower + w_upper) / 2W,
       where w_lower is the total weight of elements smaller than the
       smallest element in the part, and w_upper is the total weight
       of elements smaller (or equal) than the largest value in the
       part.

       In the unweighted case all weights are 1.0, which implies q_0 =
       n_0/N, q_1 = (n_0+n_1/2)/N, q_2 = (n_0+n_1+n_2/2)/N where n_i
       is number of elements in ith part.

       \return The quantiles vector.
    */
    const utility::Vector& quantiles(void) const;

    /**
       \return The number of parts.
    */
    size_t size(void) const;

  private:
    // unweighted "constructor"; builds from plain values
    template<typename ForwardIterator>
    void build(ForwardIterator first, ForwardIterator last, unsigned int N,
               utility::unweighted_iterator_tag);
    // weighted "constructor"; builds from (value, weight) pairs
    template<typename ForwardIterator>
    void build(ForwardIterator first, ForwardIterator last, unsigned int N,
               utility::weighted_iterator_tag);
    void init(const utility::VectorBase&, unsigned int N);
    void init(const std::vector<utility::DataWeight>&, unsigned int N);

    utility::Vector average_;
    utility::Vector quantiles_;
  };

    Partitioner target_;

    // unweighted version
    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
    void normalize(const Partitioner& source,RandomAccessIterator1 first,
                   RandomAccessIterator1 last, RandomAccessIterator2 result,
                   utility::unweighted_iterator_tag tag) const;

    // weighted version
    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
    void normalize(const Partitioner& source,RandomAccessIterator1 first,
                   RandomAccessIterator1 last, RandomAccessIterator2 result,
                   utility::weighted_iterator_tag tag) const;
  };
226
227
228  // template implementations
229
230  template<typename ForwardIterator>
231  qQuantileNormalizer::qQuantileNormalizer(ForwardIterator first,
232                                           ForwardIterator last,
233                                           unsigned int Q)
234    : target_(Partitioner(first, last, Q))
235  {
236    YAT_ASSERT(Q>2);
237  }
238
239
240  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
241  void qQuantileNormalizer::operator()(RandomAccessIterator1 first,
242                                       RandomAccessIterator1 last,
243                                       RandomAccessIterator2 result) const
244  {
245    Partitioner source(first, last, target_.size());
246    typename utility::weighted_if_any2<RandomAccessIterator1, RandomAccessIterator2>::type tag;
247    normalize(source, first, last, result, tag);
248  }
249
250
251  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
252  void
253  qQuantileNormalizer::normalize(const qQuantileNormalizer::Partitioner& source,
254                                 RandomAccessIterator1 first,
255                                 RandomAccessIterator1 last,
256                                 RandomAccessIterator2 result,
257                                 utility::unweighted_iterator_tag tag) const
258  {
259    BOOST_CONCEPT_ASSERT((boost::RandomAccessIterator<RandomAccessIterator1>));
260    BOOST_CONCEPT_ASSERT((boost::Mutable_RandomAccessIterator<RandomAccessIterator2>));
261    BOOST_CONCEPT_ASSERT((utility::DataIteratorConcept<RandomAccessIterator1>));
262    BOOST_CONCEPT_ASSERT((utility::DataIteratorConcept<RandomAccessIterator2>));
263    utility::check_iterator_is_unweighted(first);
264    utility::check_iterator_is_unweighted(result);
265    size_t N = last-first;
266    YAT_ASSERT(N >= target_.size());
267
268    std::vector<size_t> sorted_index(last-first);
269    utility::sort_index(first, last, sorted_index);
270
271    utility::Vector diff(source.averages());
272    diff-=target_.averages();
273    const utility::Vector& idx=target_.quantiles();
274    regression::CSplineInterpolation cspline(idx,diff);
275
276    // linear extrapolation for first part, i.e., use first diff for
277    // all points in the first part.
278    size_t start=0;
279    size_t end = static_cast<size_t>(std::ceil(N*idx(0) - 0.5));
280    // take care of limiting case number of parts approximately equal
281    // to the number of elements in range.
282    if (end==0)
283      ++end;
284    for (size_t i=start; i<end; ++i) {
285      size_t si = sorted_index[i];
286      result[si] = first[si] - diff(0);
287    }
288
289    using utility::yat_assert;
290
291    // cspline interpolation for all data between the mid points of
292    // the first and last part
293    start=end;
294    end = static_cast<size_t>(std::ceil(N*idx(idx.size()-1) - 0.5));
295    if (end>=N)
296      end = N-1;
297    for ( size_t i=start; i<end; ++i) {
298      size_t si = sorted_index[i];
299
300      YAT_ASSERT((i+0.5)/N>idx(0));
301      result[si] = first[si] - cspline.evaluate((i+0.5)/N);
302    }
303
304    // linear extrapolation for last part, i.e., use last diff for
305    // all points in the last part.
306    for (size_t i=end ; i<N; ++i) {
307      size_t si = sorted_index[i];
308      result[si] = first[si] - diff(diff.size()-1);
309    }
310  }
311
312
313  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
314  void qQuantileNormalizer::normalize(const Partitioner& source,
315                                      RandomAccessIterator1 first,
316                                      RandomAccessIterator1 last,
317                                      RandomAccessIterator2 result,
318                                      utility::weighted_iterator_tag tag) const
319  {
320    BOOST_CONCEPT_ASSERT((boost::RandomAccessIterator<RandomAccessIterator1>));
321    BOOST_CONCEPT_ASSERT((boost::Mutable_RandomAccessIterator<RandomAccessIterator2>));
322    BOOST_CONCEPT_ASSERT((utility::DataIterator<RandomAccessIterator1>));
323    BOOST_CONCEPT_ASSERT((utility::DataIterator<RandomAccessIterator2>));
324    // copy the weights
325    detail::copy_weight_if_weighted(first, last, result);
326
327    double total_w = std::accumulate(utility::weight_iterator(first),
328                                     utility::weight_iterator(last), 0.0);
329
330    std::vector<size_t> sorted_index(last-first);
331    // Code to avoid problems with NaN (ticket:535)
332    // utility::sort_index(utility::data_iterator(first),
333    //                     utility::data_iterator(last), sorted_index);
334    // ... above statement replaced below code block
335    {
336      using namespace utility;
337      std::vector<double> vec;
338      vec.reserve(std::distance(first, last));
339      std::copy(utility::data_iterator(first), utility::data_iterator(last),
340                std::back_inserter(vec));
341      for (std::vector<double>::iterator i(vec.begin()); i!=vec.end(); ++i)
342        if (std::isnan(*i))
343          *i = std::numeric_limits<double>::infinity();
344      utility::sort_index(vec.begin(), vec.end(), sorted_index);
345    }
346    // end Code to avoid problems with NaN (ticket:535)
347
348    utility::Vector diff(source.averages());
349    diff-=target_.averages();
350    const utility::Vector& idx=target_.quantiles();
351    regression::CSplineInterpolation cspline(idx,diff);
352
353    double sum_w=0;
354    utility::iterator_traits<RandomAccessIterator1> traits1;
355    utility::iterator_traits<RandomAccessIterator2> traits2;
356    for (size_t i=0; i<sorted_index.size(); ++i) {
357      size_t si = sorted_index[i];
358      double w = (sum_w + 0.5*traits1.weight(first+si))/total_w;
359      double correction = 0;
360      if (w <= idx(0)) {
361        // linear extrapolation for first part, i.e., use first diff for
362        // all points in the first part.
363        correction = diff(0);
364      }
365      else if (w < idx(idx.size()-1) ) {
366        // cspline interpolation for all data between the mid points of
367        // the first and last part
368        correction = cspline.evaluate(w);
369      }
370      else {
371        // linear extrapolation for last part, i.e., use last diff for
372        // all points in the last part.
373        correction = diff(diff.size()-1);
374      }
375      traits2.data(result+si) = traits1.data(first+si) - correction;
376      sum_w += traits1.weight(first+si);
377    }
378  }
379
380
381  template<typename ForwardIterator>
382  qQuantileNormalizer::Partitioner::Partitioner(ForwardIterator first,
383                                                ForwardIterator last,
384                                                unsigned int N)
385    : average_(utility::Vector(N)), quantiles_(utility::Vector(N))
386  {
387    BOOST_CONCEPT_ASSERT((boost::ForwardIterator<ForwardIterator>));
388    BOOST_CONCEPT_ASSERT((utility::DataIterator<ForwardIterator>));
389    build(first, last, N,
390          typename utility::weighted_iterator_traits<ForwardIterator>::type());
391  }
392
393
394  template<typename ForwardIterator>
395  void qQuantileNormalizer::Partitioner::build(ForwardIterator first,
396                                               ForwardIterator last,
397                                               unsigned int N,
398                                               utility::unweighted_iterator_tag)
399  {
400    utility::Vector vec(std::distance(first, last));
401    std::copy(first, last, vec.begin());
402    std::sort(vec.begin(), vec.end());
403    init(vec, N);
404  }
405
406
407  template<typename ForwardIterator>
408  void qQuantileNormalizer::Partitioner::build(ForwardIterator first,
409                                               ForwardIterator last,
410                                               unsigned int N,
411                                               utility::weighted_iterator_tag)
412  {
413    using namespace utility;
414    std::vector<DataWeight> vec(first, last);
415    std::sort(vec.begin(), vec.end());
416    init(vec, N);
417  }
418
419
420}}} // end of namespace normalizer, yat and thep
421
422#endif
Note: See TracBrowser for help on using the repository browser.