source: trunk/yat/normalizer/qQuantileNormalizer.h @ 3550

Last change on this file since 3550 was 3550, checked in by Peter, 6 years ago

Update copyright years. Happy New Year

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 14.4 KB
#ifndef _theplu_yat_normalizer_qquantile_normalizer_
#define _theplu_yat_normalizer_qquantile_normalizer_

/*
  Copyright (C) 2009 Jari Häkkinen, Peter Johansson
  Copyright (C) 2010, 2016 Peter Johansson

  This file is part of the yat library, http://dev.thep.lu.se/yat

  The yat library is free software; you can redistribute it and/or
  modify it under the terms of the GNU General Public License as
  published by the Free Software Foundation; either version 3 of the
  License, or (at your option) any later version.

  The yat library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with yat. If not, see <http://www.gnu.org/licenses/>.
*/

#include "utility.h"

#include "yat/regression/CSplineInterpolation.h"
#include "yat/utility/concept_check.h"
#include "yat/utility/DataIterator.h"
#include "yat/utility/DataWeight.h"
#include "yat/utility/Exception.h"
#include "yat/utility/iterator_traits.h"
#include "yat/utility/sort_index.h"
#include "yat/utility/Vector.h"
#include "yat/utility/WeightIterator.h"
#include "yat/utility/yat_assert.h"

#include <boost/concept_check.hpp>

#include <algorithm>
#include <cmath>
#include <iterator>
#include <limits>
#include <numeric>
#include <stdexcept>
#include <vector>

namespace theplu {
namespace yat {
namespace utility {
  class VectorBase;
}
namespace normalizer {

  /**
     \brief Perform Q-quantile normalization

     Perform a Q-quantile normalization on a \a source range, after
     which it will approximately have the same distribution of data as
     the \a target range (the Q-quantiles are the same). The rank of
     an element in the \a source range is not changed.

     The class also works with unweighted ranges, and a weighted \a
     source range may be combined with an unweighted \a target range,
     or vice versa.

     Normalization goes like this:
     - Data are not assumed to be sorted.
     - Partition the sorted \a target data into Q parts. Q must be 3
       or larger because of requirements from the underlying cubic
       spline fit.
     - Calculate the arithmetic (weighted) mean for each part; the
       mean is assigned to the midpoint of the part.
     - Do the above for the data to be transformed (called \a source
       here).
     - For each part, calculate the difference between the \a target
       and the \a source. Now we have \a Q differences \f$ d_i \f$
       with associated ranks (the midpoints of the parts).
     - Create a cubic spline fit to this difference vector \a d. The
       resulting curve is used to recalculate all values in \a source.
       - Use the cubic spline fit for values within the cubic spline
         fit range [midpoint 1st part, midpoint last part].
       - For data outside the cubic spline fit range use linear
         extrapolation, i.e., a constant shift: \f$ d_{first} \f$ for
         points below the fit range and \f$ d_{last} \f$ for points
         above it.

     \since New in yat 0.5
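
     A minimal usage sketch, assuming unweighted ranges of doubles
     held in std::vector (the numeric values are illustrative only):

     \code
     std::vector<double> target { 0.1, 0.7, 1.3, 2.2, 3.0, 4.8, 5.1, 6.4 };
     std::vector<double> source { 10.0, 12.5, 11.1, 15.7, 13.2, 14.9, 16.3, 12.0 };
     std::vector<double> result(source.size());

     // Q=4 parts; Q must be at least 3 and at most the range size
     qQuantileNormalizer qqn(target.begin(), target.end(), 4u);
     qqn(source.begin(), source.end(), result.begin());
     \endcode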
   */
  class qQuantileNormalizer
  {
  public:
    /**
       \brief Constructor

       Divides a sorted copy of range [\a first,\a last) into \a Q
       parts. Parts are divided such that the sum of weights is
       approximately the same in the different parts. If a relative
       weight, \f$ w_i / \sum w_i \f$, is larger than 1/Q, this may
       not be achievable, in which case an exception is thrown. In
       the unweighted case this implies that \a Q should be smaller
       than (or equal to) the number of elements in [\a first, \a
       last).

       The range supplied to the constructor sets the target
       distribution.

       As the \a source range is also divided into \a Q parts (when
       operator() is called), it is recommended to keep \a Q smaller
       than (or equal to) the size of the ranges that will be
       normalized.

       Also, \a Q must not be smaller than 3 due to restrictions in
       the cubic spline fit utilized in the normalization.

       \b Type \b Requirements:
       - \c ForwardIterator is a model of \forward_traversal_iterator
       - \c ForwardIterator is a \ref concept_data_iterator
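
       A sketch of the weighted case, assuming the target elements
       are stored as utility::DataWeight (data, weight) pairs and
       that the numeric values are illustrative only:

       \code
       using theplu::yat::utility::DataWeight;
       std::vector<DataWeight> target;
       target.push_back(DataWeight(1.2, 1.0));
       target.push_back(DataWeight(2.7, 0.5));
       target.push_back(DataWeight(3.1, 1.0));
       target.push_back(DataWeight(4.6, 1.0));
       target.push_back(DataWeight(5.0, 0.5));
       target.push_back(DataWeight(6.3, 1.0));

       // total weight is 5.0; with Q=3 each part holds roughly 5/3
       qQuantileNormalizer qqn(target.begin(), target.end(), 3u);
       \endcode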
    */
    template<typename ForwardIterator>
    qQuantileNormalizer(ForwardIterator first, ForwardIterator last,
                        unsigned int Q);

    /**
       \brief Perform the Q-quantile normalization.

       Elements in [\a first, \a last) are normalized as described
       above and the result is assigned to [\a result, \a result + \a
       last-\a first). Input range [\a first, \a last) is not
       modified. If the ranges are weighted, the weights are copied
       from [\a first, \a last) to the \a result range.

       It is possible to normalize "in place"; it is permissible for
       \a first and \a result to be the same. However, as assignment
       occurs sequentially, the operation is undefined if \a result
       points into the middle of the input range, i.e., into
       (\a first, \a last).

       \b Type Requirements:
       - \c RandomAccessIterator1 is a \ref concept_data_iterator
       - \c RandomAccessIterator1 is a \random_access_traversal_iterator
       - \c RandomAccessIterator2 is a \ref concept_data_iterator
       - \c RandomAccessIterator2 is a \random_access_traversal_iterator
       - \c RandomAccessIterator2 is a \writable_iterator

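       A sketch of in-place normalization, assuming \c data is a
       std::vector<double> and \c qqn a previously constructed
       qQuantileNormalizer:

       \code
       qqn(data.begin(), data.end(), data.begin());
       \endcode
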
     */
    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
    void operator()(RandomAccessIterator1 first, RandomAccessIterator1 last,
                    RandomAccessIterator2 result) const;

  private:

  /**
     \brief Partition a range of data into parts of roughly equal weight.

     Copy the range [first, last), sort the copy, and divide the
     sorted copy into Q parts. The parts are created such that the
     total weight in a part is approximately W/Q, where W is the total
     weight over all parts. The class calculates the average value in
     each part and also the "quantile" of each part.
  */
  class Partitioner
  {
  public:
    /**
       \brief Create the partition and perform the required calculations.
    */
    template<typename ForwardIterator>
    Partitioner(ForwardIterator first, ForwardIterator last,
                unsigned int N);

    /**
       \brief Return the averages for each part.

       \return The average vector.
    */
    const utility::Vector& averages(void) const;

    /**
       The quantile (here) is defined as (w_lower + w_upper) / 2W,
       where w_lower is the total weight of elements smaller than the
       smallest element in the part, and w_upper is the total weight
       of elements smaller than (or equal to) the largest value in the
       part.

       In the unweighted case all weights are 1.0, which implies
       q_0 = (n_0/2)/N, q_1 = (n_0+n_1/2)/N, q_2 = (n_0+n_1+n_2/2)/N,
       where n_i is the number of elements in the i:th part.

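       For example (illustrative numbers), with N = 10 unweighted
       elements split into parts of sizes 4, 3, and 3, the quantiles
       are q_0 = 2/10 = 0.2, q_1 = (4+1.5)/10 = 0.55, and
       q_2 = (7+1.5)/10 = 0.85.
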
       \return The quantiles vector.
    */
    const utility::Vector& quantiles(void) const;

    /**
       \return The number of parts.
    */
    size_t size(void) const;

  private:
    // unweighted "constructor"
    template<typename ForwardIterator>
    void build(ForwardIterator first, ForwardIterator last, unsigned int N,
               utility::unweighted_iterator_tag);
    // weighted "constructor"
    template<typename ForwardIterator>
    void build(ForwardIterator first, ForwardIterator last, unsigned int N,
               utility::weighted_iterator_tag);
    void init(const utility::VectorBase&, unsigned int N);
    void init(const std::vector<utility::DataWeight>&, unsigned int N);

    utility::Vector average_;
    utility::Vector quantiles_;
  };

    Partitioner target_;

    // unweighted version
    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
    void normalize(const Partitioner& source, RandomAccessIterator1 first,
                   RandomAccessIterator1 last, RandomAccessIterator2 result,
                   utility::unweighted_iterator_tag tag) const;

    // weighted version
    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
    void normalize(const Partitioner& source, RandomAccessIterator1 first,
                   RandomAccessIterator1 last, RandomAccessIterator2 result,
                   utility::weighted_iterator_tag tag) const;
  };


  // template implementations

  template<typename ForwardIterator>
  qQuantileNormalizer::qQuantileNormalizer(ForwardIterator first,
                                           ForwardIterator last,
                                           unsigned int Q)
    : target_(Partitioner(first, last, Q))
  {
    YAT_ASSERT(Q>2);
  }


  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
  void qQuantileNormalizer::operator()(RandomAccessIterator1 first,
                                       RandomAccessIterator1 last,
                                       RandomAccessIterator2 result) const
  {
    BOOST_CONCEPT_ASSERT((utility::DataIterator<RandomAccessIterator1>));
    using boost_concepts::RandomAccessTraversal;
    BOOST_CONCEPT_ASSERT((RandomAccessTraversal<RandomAccessIterator1>));
    BOOST_CONCEPT_ASSERT((utility::DataIterator<RandomAccessIterator2>));
    BOOST_CONCEPT_ASSERT((RandomAccessTraversal<RandomAccessIterator2>));
    using boost_concepts::WritableIterator;
    BOOST_CONCEPT_ASSERT((WritableIterator<RandomAccessIterator2>));

    Partitioner source(first, last, target_.size());
    typename utility::weighted_if_any2<RandomAccessIterator1, RandomAccessIterator2>::type tag;
    normalize(source, first, last, result, tag);
  }


  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
  void
  qQuantileNormalizer::normalize(const qQuantileNormalizer::Partitioner& source,
                                 RandomAccessIterator1 first,
                                 RandomAccessIterator1 last,
                                 RandomAccessIterator2 result,
                                 utility::unweighted_iterator_tag tag) const
  {
    utility::check_iterator_is_unweighted(first);
    utility::check_iterator_is_unweighted(result);
    size_t N = last-first;
    YAT_ASSERT(N >= target_.size());

    std::vector<size_t> sorted_index(last-first);
    utility::sort_index(first, last, sorted_index);

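    // diff(i) is the average of the i:th part of the source minus the
    // average of the i:th part of the target; the cubic spline maps a
    // quantile (part midpoint) to that correction.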
    utility::Vector diff(source.averages());
    diff-=target_.averages();
    const utility::Vector& idx=target_.quantiles();
    regression::CSplineInterpolation cspline(idx,diff);

    // linear extrapolation for first part, i.e., use first diff for
    // all points in the first part.
    size_t start=0;
    size_t end = static_cast<size_t>(std::ceil(N*idx(0) - 0.5));
    // take care of limiting case number of parts approximately equal
    // to the number of elements in range.
    if (end==0)
      ++end;
    for (size_t i=start; i<end; ++i) {
      size_t si = sorted_index[i];
      result[si] = first[si] - diff(0);
    }

    using utility::yat_assert;

    // cspline interpolation for all data between the mid points of
    // the first and last part
    start=end;
    end = static_cast<size_t>(std::ceil(N*idx(idx.size()-1) - 0.5));
    if (end>=N)
      end = N-1;
    for ( size_t i=start; i<end; ++i) {
      size_t si = sorted_index[i];

      YAT_ASSERT((i+0.5)/N>idx(0));
      result[si] = first[si] - cspline.evaluate((i+0.5)/N);
    }

    // linear extrapolation for last part, i.e., use last diff for
    // all points in the last part.
    for (size_t i=end ; i<N; ++i) {
      size_t si = sorted_index[i];
      result[si] = first[si] - diff(diff.size()-1);
    }
  }


  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
  void qQuantileNormalizer::normalize(const Partitioner& source,
                                      RandomAccessIterator1 first,
                                      RandomAccessIterator1 last,
                                      RandomAccessIterator2 result,
                                      utility::weighted_iterator_tag tag) const
  {
    // copy the weights
    detail::copy_weight_if_weighted(first, last, result);

    double total_w = std::accumulate(utility::weight_iterator(first),
                                     utility::weight_iterator(last), 0.0);

    std::vector<size_t> sorted_index(last-first);
    // Code to avoid problems with NaN (ticket:535)
    // utility::sort_index(utility::data_iterator(first),
    //                     utility::data_iterator(last), sorted_index);
    // ... above statement replaced below code block
    {
      using namespace utility;
      std::vector<double> vec;
      vec.reserve(std::distance(first, last));
      std::copy(utility::data_iterator(first), utility::data_iterator(last),
                std::back_inserter(vec));
      for (std::vector<double>::iterator i(vec.begin()); i!=vec.end(); ++i)
        if (std::isnan(*i))
          *i = std::numeric_limits<double>::infinity();
      utility::sort_index(vec.begin(), vec.end(), sorted_index);
    }
    // end Code to avoid problems with NaN (ticket:535)

    utility::Vector diff(source.averages());
    diff-=target_.averages();
    const utility::Vector& idx=target_.quantiles();
    regression::CSplineInterpolation cspline(idx,diff);

    double sum_w=0;
    utility::iterator_traits<RandomAccessIterator1> traits1;
    utility::iterator_traits<RandomAccessIterator2> traits2;
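    // walk through the elements in rank order; w is the mid-rank
    // (weighted) quantile of the current element and decides whether
    // the constant shift or the cubic spline correction applies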
    for (size_t i=0; i<sorted_index.size(); ++i) {
      size_t si = sorted_index[i];
      double w = (sum_w + 0.5*traits1.weight(first+si))/total_w;
      double correction = 0;
      if (w <= idx(0)) {
        // linear extrapolation for first part, i.e., use first diff for
        // all points in the first part.
        correction = diff(0);
      }
      else if (w < idx(idx.size()-1) ) {
        // cspline interpolation for all data between the mid points of
        // the first and last part
        correction = cspline.evaluate(w);
      }
      else {
        // linear extrapolation for last part, i.e., use last diff for
        // all points in the last part.
        correction = diff(diff.size()-1);
      }
      traits2.data(result+si) = traits1.data(first+si) - correction;
      sum_w += traits1.weight(first+si);
    }
  }


  template<typename ForwardIterator>
  qQuantileNormalizer::Partitioner::Partitioner(ForwardIterator first,
                                                ForwardIterator last,
                                                unsigned int N)
    : average_(utility::Vector(N)), quantiles_(utility::Vector(N))
  {
    BOOST_CONCEPT_ASSERT((boost_concepts::ForwardTraversal<ForwardIterator>));
    BOOST_CONCEPT_ASSERT((utility::DataIterator<ForwardIterator>));
    build(first, last, N,
          typename utility::weighted_iterator_traits<ForwardIterator>::type());
  }


  template<typename ForwardIterator>
  void qQuantileNormalizer::Partitioner::build(ForwardIterator first,
                                               ForwardIterator last,
                                               unsigned int N,
                                               utility::unweighted_iterator_tag)
  {
    utility::Vector vec(std::distance(first, last));
    std::copy(first, last, vec.begin());
    std::sort(vec.begin(), vec.end());
    init(vec, N);
  }


  template<typename ForwardIterator>
  void qQuantileNormalizer::Partitioner::build(ForwardIterator first,
                                               ForwardIterator last,
                                               unsigned int N,
                                               utility::weighted_iterator_tag)
  {
    using namespace utility;
    std::vector<DataWeight> vec(first, last);
    std::sort(vec.begin(), vec.end());
    init(vec, N);
  }


}}} // end of namespace normalizer, yat and theplu

#endif